diff --git a/rpcs3/Emu/RSX/VK/VKCompute.h b/rpcs3/Emu/RSX/VK/VKCompute.h
index 0d8bf03252..2801519668 100644
--- a/rpcs3/Emu/RSX/VK/VKCompute.h
+++ b/rpcs3/Emu/RSX/VK/VKCompute.h
@@ -286,13 +286,14 @@ namespace vk
 			"#define d24x8_to_d24x8_swapped(bits) (bits & 0xFF00) | (bits & 0xFF0000) >> 16 | (bits & 0xFF) << 16\n"
 			"#define f32_to_d24x8_swapped(bits) d24x8_to_d24x8_swapped(f32_to_d24(bits))\n"
 			"\n"
+			"%md"
 			"void main()\n"
 			"{\n"
 			"	uint invocations_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);"
 			"	uint invocation_id = (gl_GlobalInvocationID.y * invocations_x) + gl_GlobalInvocationID.x;\n"
 			"	uint index = invocation_id * KERNEL_SIZE;\n"
 			"	uint value;\n"
-			"	%vars"
+			"%vars"
 			"\n";

 			const auto parameters_size = align(push_constants_size, 16) / 16;
@@ -302,6 +303,7 @@
 			{
 				{ "%ks", std::to_string(kernel_size) },
 				{ "%vars", variables },
 				{ "%f", function_name },
+				{ "%md", method_declarations },
 				{ "%ub", use_push_constants ? "layout(push_constant) uniform ubo{ uvec4 params[" + std::to_string(parameters_size) + "]; };\n" : "" },
 			};
@@ -458,6 +460,7 @@
 			u32 parameters[4] = { data_length, zeta_offset - data_offset, stencil_offset - data_offset, 0 };
 			set_parameters(cmd, parameters, 4);

+			verify(HERE), stencil_offset > data_offset;
 			m_ssbo_length = stencil_offset + (data_length / 4) - data_offset;
 			cs_shuffle_base::run(cmd, data, data_length, data_offset);
 		}
@@ -588,6 +591,132 @@
 		}
 	};

+	template<typename From, typename To, bool _SwapSrc = false, bool _SwapDst = false>
+	struct cs_fconvert_task : cs_shuffle_base
+	{
+		u32 m_ssbo_length = 0;
+
+		void declare_f16_expansion()
+		{
+			method_declarations +=
+				"uvec2 unpack_e4m12_pack16(const in uint value)\n"
+				"{\n"
+				"	uvec2 result = uvec2(bitfieldExtract(value, 0, 16), bitfieldExtract(value, 16, 16));\n"
+				"	result <<= 11;\n"
+				"	result += (120 << 23);\n"
+				"	return result;\n"
+				"}\n\n";
+		}
+
+		void declare_f16_contraction()
+		{
+			method_declarations +=
+				"uint pack_e4m12_pack16(const in uvec2 value)\n"
+				"{\n"
+				"	uvec2 result = (value - (120 << 23)) >> 11;\n"
+				"	return (result.x & 0xFFFF) | (result.y << 16);\n"
+				"}\n\n";
+		}
+
+		cs_fconvert_task()
+		{
+			use_push_constants = true;
+			push_constants_size = 16;
+
+			variables =
+				"	uint block_length = params[0].x >> 2;\n"
+				"	uint in_offset = params[0].y >> 2;\n"
+				"	uint out_offset = params[0].z >> 2;\n"
+				"	uvec4 tmp;\n";
+
+			work_kernel +=
+				"		if (index >= block_length)\n"
+				"			return;\n";
+
+			if constexpr (sizeof(From) == 4)
+			{
+				static_assert(sizeof(To) == 2);
+				declare_f16_contraction();
+
+				work_kernel +=
+					"		const uint src_offset = (index * 2) + in_offset;\n"
+					"		const uint dst_offset = index + out_offset;\n"
+					"		tmp.x = data[src_offset];\n"
+					"		tmp.y = data[src_offset + 1];\n";
+
+				if constexpr (_SwapSrc)
+				{
+					work_kernel +=
+						"		tmp = bswap_u32(tmp);\n";
+				}
+
+				// Convert
+				work_kernel += "		tmp.z = pack_e4m12_pack16(tmp.xy);\n";
+
+				if constexpr (_SwapDst)
+				{
+					work_kernel += "		tmp.z = bswap_u16(tmp.z);\n";
+				}
+
+				work_kernel += "		data[dst_offset] = tmp.z;\n";
+			}
+			else
+			{
+				static_assert(sizeof(To) == 4);
+				declare_f16_expansion();
+
+				work_kernel +=
+					"		const uint src_offset = index + in_offset;\n"
+					"		const uint dst_offset = (index * 2) + out_offset;\n"
+					"		tmp.x = data[src_offset];\n";
+
+				if constexpr (_SwapSrc)
+				{
+					work_kernel +=
+						"		tmp.x = bswap_u16(tmp.x);\n";
+				}
+
+				// Convert
+				work_kernel += "		tmp.yz = unpack_e4m12_pack16(tmp.x);\n";
+
+				if constexpr (_SwapDst)
+				{
+					work_kernel += "		tmp.yz = bswap_u16(tmp.yz);\n";
+				}
+
+				work_kernel +=
+					"		data[dst_offset] = tmp.y;\n"
+					"		data[dst_offset + 1] = tmp.z;\n";
+			}
+
+			cs_shuffle_base::build("");
+		}
+
+		void bind_resources() override
+		{
+			m_program->bind_buffer({ m_data->value, m_data_offset, m_ssbo_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
+		}
+
+		void run(VkCommandBuffer cmd, const vk::buffer* data, u32 src_offset, u32 src_length, u32 dst_offset)
+		{
+			u32 data_offset;
+			if (src_offset > dst_offset)
+			{
+				m_ssbo_length = (src_offset + src_length) - dst_offset;
+				data_offset = dst_offset;
+			}
+			else
+			{
+				m_ssbo_length = (dst_offset - src_offset) + (src_length / sizeof(From)) * sizeof(To);
+				data_offset = src_offset;
+			}
+
+			u32 parameters[4] = { src_length, src_offset - data_offset, dst_offset - data_offset, 0 };
+			set_parameters(cmd, parameters, 4);
+			cs_shuffle_base::run(cmd, data, src_length, data_offset);
+		}
+	};
+
 	// Reverse morton-order block arrangement
 	struct cs_deswizzle_base : compute_task
 	{
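Note: the pack_e4m12_pack16/unpack_e4m12_pack16 helpers above convert between IEEE-754 binary32 and the RSX DEPTH16_FLOAT layout (no sign bit, 4-bit exponent with bias 7, 12-bit mantissa) by shifting the mantissa by 11 bits and rebiasing the exponent (127 - 120 = 7). A minimal CPU-side sketch of the same bit math, with hypothetical helper names that are not part of this patch (and, like the shader, no clamping or denormal handling):

    #include <cstdint>
    #include <cstring>

    // f32 -> e4m12: drop 11 mantissa bits, rebias exponent from 127 to 7.
    // Mirrors the GLSL pack_e4m12_pack16() for a single value (hypothetical CPU port).
    static uint16_t f32_to_e4m12(float v)
    {
        uint32_t bits;
        std::memcpy(&bits, &v, sizeof(bits));
        return static_cast<uint16_t>((bits - (120u << 23)) >> 11);
    }

    // e4m12 -> f32: the inverse, mirrors unpack_e4m12_pack16().
    static float e4m12_to_f32(uint16_t h)
    {
        const uint32_t bits = (uint32_t{h} << 11) + (120u << 23);
        float v;
        std::memcpy(&v, &bits, sizeof(v));
        return v;
    }

For example, 1.0f (0x3F800000) packs to 0x7000: the exponent 127 becomes 7 (the new bias), which reads back as 2^0 with a zero mantissa.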
diff --git a/rpcs3/Emu/RSX/VK/VKFormats.cpp b/rpcs3/Emu/RSX/VK/VKFormats.cpp
index 55f8e61eb3..9c9b73bc7a 100644
--- a/rpcs3/Emu/RSX/VK/VKFormats.cpp
+++ b/rpcs3/Emu/RSX/VK/VKFormats.cpp
@@ -335,6 +335,7 @@ namespace vk
 	case VK_FORMAT_R32G32B32A32_SFLOAT:
 		return 16;
 	case VK_FORMAT_D16_UNORM:
+	case VK_FORMAT_D32_SFLOAT:
 		return 2;
 	case VK_FORMAT_D32_SFLOAT_S8_UINT: //TODO: Translate to D24S8
 	case VK_FORMAT_D24_UNORM_S8_UINT:
@@ -396,6 +397,7 @@ namespace vk
 		return{ 4, 1 };
 	//Depth
 	case VK_FORMAT_D16_UNORM:
+	case VK_FORMAT_D32_SFLOAT:
 		return{ 2, 1 };
 	case VK_FORMAT_D32_SFLOAT_S8_UINT:
 	case VK_FORMAT_D24_UNORM_S8_UINT:
diff --git a/rpcs3/Emu/RSX/VK/VKTexture.cpp b/rpcs3/Emu/RSX/VK/VKTexture.cpp
index 5a479cefca..d68b89c584 100644
--- a/rpcs3/Emu/RSX/VK/VKTexture.cpp
+++ b/rpcs3/Emu/RSX/VK/VKTexture.cpp
@@ -80,7 +80,47 @@ namespace vk
 	}
 	case VK_FORMAT_D32_SFLOAT:
 	{
-		fmt::throw_exception("Unsupported transfer (D16_FLOAT");
+		rsx_log.error("Unsupported transfer (D16_FLOAT)"); // Need real games to test this.
+		verify(HERE), region.imageSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT;
+
+		const u32 out_w = region.bufferRowLength ? region.bufferRowLength : region.imageExtent.width;
+		const u32 out_h = region.bufferImageHeight ? region.bufferImageHeight : region.imageExtent.height;
+		const u32 packed32_length = out_w * out_h * 4;
+		const u32 packed16_length = out_w * out_h * 2;
+
+		const auto allocation_end = region.bufferOffset + packed32_length + packed16_length;
+		verify(HERE), dst->size() >= allocation_end;
+
+		const auto data_offset = u32(region.bufferOffset);
+		const auto z32_offset = align(data_offset + packed16_length, 256);
+
+		// 1. Copy the depth to buffer
+		VkBufferImageCopy region2;
+		region2 = region;
+		region2.bufferOffset = z32_offset;
+		vkCmdCopyImageToBuffer(cmd, src->value, src->current_layout, dst->value, 1, &region2);
+
+		// 2. Pre-compute barrier
+		vk::insert_buffer_memory_barrier(cmd, dst->value, z32_offset, packed32_length,
+			VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
+			VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
+
+		// 3. Do conversion with byteswap [D32->D16F]
+		if (!swap_bytes) [[likely]]
+		{
+			auto job = vk::get_compute_task<vk::cs_fconvert_task<f32, f16>>();
+			job->run(cmd, dst, z32_offset, packed32_length, data_offset);
+		}
+		else
+		{
+			auto job = vk::get_compute_task<vk::cs_fconvert_task<f32, f16, false, true>>();
+			job->run(cmd, dst, z32_offset, packed32_length, data_offset);
+		}
+
+		// 4. Post-compute barrier
+		vk::insert_buffer_memory_barrier(cmd, dst->value, region.bufferOffset, packed16_length,
+			VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
+			VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
 		break;
 	}
 	case VK_FORMAT_D24_UNORM_S8_UINT:
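The download path above stages the raw D32F copy at z32_offset and writes the packed D16F result back at region.bufferOffset; the 256-byte alignment is a conservative upper bound for VkPhysicalDeviceLimits::minStorageBufferOffsetAlignment. A worked example of the scratch layout, assuming a 128x64 readback (illustrative numbers only, not from the patch):

    // Hypothetical offsets for a 128x64 D32_SFLOAT -> D16_FLOAT readback.
    const uint32_t out_w = 128, out_h = 64;
    const uint32_t packed32_length = out_w * out_h * 4;   // 32768 bytes of raw D32F
    const uint32_t packed16_length = out_w * out_h * 2;   // 16384 bytes of packed D16F
    const uint32_t data_offset     = 0;                   // region.bufferOffset
    // align(data_offset + packed16_length, 256) == 16384 here, so the staging
    // region starts immediately after the packed output.
    const uint32_t z32_offset      = (data_offset + packed16_length + 255) & ~255u;
    // The scratch buffer must hold at least z32_offset + packed32_length bytes.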
@@ -177,7 +217,38 @@
 	}
 	case VK_FORMAT_D32_SFLOAT:
 	{
-		fmt::throw_exception("Unsupported transfer (D16_FLOAT");
+		rsx_log.error("Unsupported transfer (D16_FLOAT)");
+		verify(HERE), region.imageSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT;
+
+		const u32 out_w = region.bufferRowLength ? region.bufferRowLength : region.imageExtent.width;
+		const u32 out_h = region.bufferImageHeight ? region.bufferImageHeight : region.imageExtent.height;
+		const u32 packed32_length = out_w * out_h * 4;
+		const u32 packed16_length = out_w * out_h * 2;
+
+		const auto allocation_end = region.bufferOffset + packed32_length + packed16_length;
+		verify(HERE), src->size() >= allocation_end;
+
+		const auto data_offset = u32(region.bufferOffset);
+		const auto z32_offset = align(data_offset + packed16_length, 256);
+
+		// 1. Pre-compute barrier
+		vk::insert_buffer_memory_barrier(cmd, src->value, z32_offset, packed32_length,
+			VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
+			VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
+
+		// 2. Do conversion with byteswap [D16F->D32F]
+		auto job = vk::get_compute_task<vk::cs_fconvert_task<f16, f32, true>>();
+		job->run(cmd, src, data_offset, packed16_length, z32_offset);
+
+		// 3. Post-compute barrier
+		vk::insert_buffer_memory_barrier(cmd, src->value, z32_offset, packed32_length,
+			VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
+			VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
+
+		// 4. Copy the depth data to image
+		VkBufferImageCopy region2 = region;
+		region2.bufferOffset = z32_offset;
+		vkCmdCopyBufferToImage(cmd, src->value, dst->value, dst->current_layout, 1, &region2);
 		break;
 	}
 	case VK_FORMAT_D24_UNORM_S8_UINT:
@@ -770,6 +841,7 @@
 		const std::vector<rsx_subresource_layout>& subresource_layout, int format, bool is_swizzled, u16 mipmap_count,
 		VkImageAspectFlags flags, vk::data_heap &upload_heap, u32 heap_align)
 	{
+		const bool requires_depth_processing = (dst_image->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT) || (format == CELL_GCM_TEXTURE_DEPTH16_FLOAT);
 		u32 block_in_pixel = rsx::get_format_block_size_in_texel(format);
 		u8 block_size_in_bytes = rsx::get_format_block_size_in_bytes(format);

@@ -842,7 +914,7 @@
 			copy_info.imageSubresource.mipLevel = layout.level;
 			copy_info.bufferRowLength = std::max(block_in_pixel * row_pitch / block_size_in_bytes, layout.width_in_texel);

-			if (opt.require_swap || opt.require_deswizzle || dst_image->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT)
+			if (opt.require_swap || opt.require_deswizzle || requires_depth_processing)
 			{
 				if (!scratch_buf)
 				{
@@ -871,7 +943,7 @@
 			}
 		}

-		if (opt.require_swap || opt.require_deswizzle || dst_image->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT)
+		if (opt.require_swap || opt.require_deswizzle || requires_depth_processing)
 		{
 			verify(HERE), scratch_buf;
 			vkCmdCopyBuffer(cmd, upload_heap.heap->value, scratch_buf->value, static_cast<u32>(buffer_copies.size()), buffer_copies.data());
@@ -902,7 +974,7 @@
 		}

 		// CopyBufferToImage routines
-		if (dst_image->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT)
+		if (requires_depth_processing)
 		{
 			// Upload in reverse to avoid polluting data in lower space
 			for (auto rIt = copy_regions.crbegin(); rIt != copy_regions.crend(); ++rIt)
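For context on the cs_fconvert_task::run() calls used in both transfer paths: the task binds a single storage-buffer window that must cover the source range and the converted destination range, whichever order they appear in the buffer. A condensed restatement of that bound computation (byte units; a sketch under the same assumptions as the patch, not its code verbatim):

    #include <cstdint>

    // Sketch: the storage-buffer window cs_fconvert_task<From, To>::run() binds.
    // It must cover src [src_offset, src_offset + src_length) and
    // dst [dst_offset, dst_offset + (src_length / sizeof(From)) * sizeof(To)).
    template <typename From, typename To>
    void compute_ssbo_window(uint32_t src_offset, uint32_t src_length, uint32_t dst_offset,
                             uint32_t& ssbo_base, uint32_t& ssbo_length)
    {
        if (src_offset > dst_offset)
        {
            // dst sits below src: span from dst start to the end of src.
            ssbo_base   = dst_offset;
            ssbo_length = (src_offset + src_length) - dst_offset;
        }
        else
        {
            // src sits below dst: span from src start to the end of the converted dst.
            ssbo_base   = src_offset;
            ssbo_length = (dst_offset - src_offset) +
                          static_cast<uint32_t>((src_length / sizeof(From)) * sizeof(To));
        }
    }

In the download case the aligned D32F staging area sits above the packed output, so the first branch applies; in the upload case the packed D16F input sits below the staging area and the second branch applies.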