vk: Improve D16F handling

- Adds D16F upload and download routines. They are mostly untested, which is why both transfer paths still log an "Unsupported transfer (D16_FLOAT)" error message
2024-11-25 12:12:50 +01:00 · 2020-08-29 17:06:12 +03:00 · 2020-08-29 17:06:12 +03:00 · af9e217fa4
commit af9e217fa4
parent 9a51f22265
3 changed files with 209 additions and 6 deletions
--- a/rpcs3/Emu/RSX/VK/VKCompute.h
+++ b/rpcs3/Emu/RSX/VK/VKCompute.h
@ -286,13 +286,14 @@ namespace vk
 				"#define d24x8_to_d24x8_swapped(bits) (bits & 0xFF00) | (bits & 0xFF0000) >> 16 | (bits & 0xFF) << 16\n"
 				"#define f32_to_d24x8_swapped(bits)   d24x8_to_d24x8_swapped(f32_to_d24(bits))\n"
 				"\n"
+				"%md"
 				"void main()\n"
 				"{\n"
 				"	uint invocations_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);"
 				"	uint invocation_id = (gl_GlobalInvocationID.y * invocations_x) + gl_GlobalInvocationID.x;\n"
 				"	uint index = invocation_id * KERNEL_SIZE;\n"
 				"	uint value;\n"
-				"	%vars"
+				"%vars"
 				"\n";

 			const auto parameters_size = align(push_constants_size, 16) / 16;
@ -302,6 +303,7 @@ namespace vk
 				{ "%ks", std::to_string(kernel_size) },
 				{ "%vars", variables },
 				{ "%f", function_name },
+				{ "%md", method_declarations },
 				{ "%ub", use_push_constants? "layout(push_constant) uniform ubo{ uvec4 params[" + std::to_string(parameters_size) + "]; };\n" : "" },
 			};

@ -458,6 +460,7 @@ namespace vk
 			u32 parameters[4] = { data_length, zeta_offset - data_offset, stencil_offset - data_offset, 0 };
 			set_parameters(cmd, parameters, 4);

+			verify(HERE), stencil_offset > data_offset;
 			m_ssbo_length = stencil_offset + (data_length / 4) - data_offset;
 			cs_shuffle_base::run(cmd, data, data_length, data_offset);
 		}
@ -588,6 +591,132 @@ namespace vk
 		}
 	};

+	template<typename To, typename From, bool _SwapSrc = false, bool _SwapDst = false> // sizeof(From) == 4 selects the 32->16 packing kernel (To must be 2 bytes); otherwise the 16->32 expansion kernel (To must be 4 bytes)
+	struct cs_fconvert_task : cs_shuffle_base // Compute job converting words between a 32-bit form and two packed 16-bit values, reading and writing the same buffer
+	{ // _SwapSrc byteswaps the loaded source words, _SwapDst byteswaps the converted result before it is stored
+		u32 m_ssbo_length = 0; // Byte range bound by bind_resources(); computed in run()
+
+		void declare_f16_expansion() // Appends the GLSL helper that widens the two 16-bit halves of one word into two 32-bit words
+		{
+			method_declarations +=
+				"uvec2 unpack_e4m12_pack16(const in uint value)\n"
+				"{\n"
+				"	uvec2 result = uvec2(bitfieldExtract(value, 0, 16), bitfieldExtract(value, 16, 16));\n"
+				"	result <<= 11;\n" // presumably lines a 12-bit mantissa up with the 23-bit binary32 mantissa - TODO confirm
+				"	result += (120 << 23);\n" // presumably rebiases the exponent field for binary32 - TODO confirm
+				"	return result;\n"
+				"}\n\n";
+		}
+
+		void declare_f16_contraction() // Appends the GLSL helper that applies the inverse steps: two 32-bit words -> one word holding two 16-bit halves
+		{
+			method_declarations +=
+				"uint pack_e4m12_pack16(const in uvec2 value)\n"
+				"{\n"
+				"	uvec2 result = (value - (120 << 23)) >> 11;\n"
+				"	return (result.x & 0xFFFF) | (result.y << 16);\n" // .x lands in the low half, .y in the high half
+				"}\n\n";
+		}
+
+		cs_fconvert_task() // Assembles the kernel text for the selected direction and hands it to cs_shuffle_base::build
+		{
+			use_push_constants = true;
+			push_constants_size = 16; // one uvec4: { length, input offset, output offset, unused } as filled in by run()
+
+			variables =
+				"	uint block_length = params[0].x >> 2;\n" // values from run() are divided by 4 before being used as data[] indices
+				"	uint in_offset = params[0].y >> 2;\n"
+				"	uint out_offset = params[0].z >> 2;\n"
+				"	uvec4 tmp;\n";
+
+			work_kernel += (sizeof(From) == 4) ? // Packing reads two source words per index, so its bound is on index * 2; a bound on index alone lets the upper half read past the source
+				"		if ((index * 2) >= block_length)\n			return;\n" :
+				"		if (index >= block_length)\n			return;\n"; // NOTE(review): '+=' appends to whatever work_kernel already holds - confirm '=' was not intended
+
+			if constexpr (sizeof(From) == 4)
+			{
+				static_assert(sizeof(To) == 2);
+				declare_f16_contraction();
+
+				work_kernel +=
+					"		const uint src_offset = (index * 2) + in_offset;\n" // two input words are consumed per output word
+					"		const uint dst_offset = index + out_offset;\n"
+					"		tmp.x = data[src_offset];\n"
+					"		tmp.y = data[src_offset + 1];\n";
+
+				if constexpr (_SwapSrc)
+				{
+					work_kernel +=
+						"		tmp = bswap_u32(tmp);\n";
+				}
+
+				// Convert
+				work_kernel += "		tmp.z = pack_e4m12_pack16(tmp.xy);\n"; // .xy: the helper takes a uvec2 and GLSL performs no implicit uvec4 -> uvec2 conversion
+
+				if constexpr (_SwapDst)
+				{
+					work_kernel += "		tmp.z = bswap_u16(tmp.z);\n";
+				}
+
+				work_kernel += "		data[dst_offset] = tmp.z;\n";
+			}
+			else
+			{
+				static_assert(sizeof(To) == 4);
+				declare_f16_expansion();
+
+				work_kernel +=
+					"		const uint src_offset = index + in_offset;\n"
+					"		const uint dst_offset = (index * 2) + out_offset;\n" // two output words are produced per input word
+					"		tmp.x = data[src_offset];\n";
+
+				if constexpr (_SwapSrc)
+				{
+					work_kernel +=
+						"		tmp.x = bswap_u16(tmp.x);\n";
+				}
+
+				// Convert
+				work_kernel += "		tmp.yz = unpack_e4m12_pack16(tmp.x);\n";
+
+				if constexpr (_SwapDst)
+				{
+					work_kernel += "		tmp.yz = bswap_u16(tmp.yz);\n"; // NOTE(review): the output words are 32-bit here - confirm bswap_u16 rather than bswap_u32 is intended
+				}
+
+				work_kernel +=
+					"		data[dst_offset] = tmp.y;\n"
+					"		data[dst_offset + 1] = tmp.z;\n";
+			}
+
+			cs_shuffle_base::build("");
+		}
+
+		void bind_resources() override // Binds the window [m_data_offset, m_data_offset + m_ssbo_length) of m_data as the storage buffer at binding 0
+		{
+			m_program->bind_buffer({ m_data->value, m_data_offset, m_ssbo_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
+		}
+
+		void run(VkCommandBuffer cmd, const vk::buffer* data, u32 src_offset, u32 src_length, u32 dst_offset) // Converts src_length bytes found at src_offset in 'data' and stores the result at dst_offset in the same buffer
+		{
+			u32 data_offset; // lower of the two offsets; becomes the start of the bound window
+			if (src_offset > dst_offset)
+			{
+				m_ssbo_length = (src_offset + src_length) - dst_offset; // window runs from the destination start to the end of the source
+				data_offset = dst_offset;
+			}
+			else
+			{
+				m_ssbo_length = (dst_offset - src_offset) + (src_length / sizeof(From)) * sizeof(To); // window runs from the source start to the end of the converted output
+				data_offset = src_offset;
+			}
+
+			u32 parameters[4] = { src_length, src_offset - data_offset, dst_offset - data_offset, 0 }; // offsets are made relative to the bound window
+			set_parameters(cmd, parameters, 4);
+			cs_shuffle_base::run(cmd, data, src_length, data_offset);
+		}
+	};
+
 	// Reverse morton-order block arrangement
 	struct cs_deswizzle_base : compute_task
 	{
--- a/rpcs3/Emu/RSX/VK/VKFormats.cpp
+++ b/rpcs3/Emu/RSX/VK/VKFormats.cpp
@ -335,6 +335,7 @@ namespace vk
 		case VK_FORMAT_R32G32B32A32_SFLOAT:
 			return 16;
 		case VK_FORMAT_D16_UNORM:
+		case VK_FORMAT_D32_SFLOAT:
 			return 2;
 		case VK_FORMAT_D32_SFLOAT_S8_UINT: //TODO: Translate to D24S8
 		case VK_FORMAT_D24_UNORM_S8_UINT:
@ -396,6 +397,7 @@ namespace vk
 			return{ 4, 1 };
 		//Depth
 		case VK_FORMAT_D16_UNORM:
+		case VK_FORMAT_D32_SFLOAT:
 			return{ 2, 1 };
 		case VK_FORMAT_D32_SFLOAT_S8_UINT:
 		case VK_FORMAT_D24_UNORM_S8_UINT:
--- a/rpcs3/Emu/RSX/VK/VKTexture.cpp
+++ b/rpcs3/Emu/RSX/VK/VKTexture.cpp
@ -80,7 +80,47 @@ namespace vk
 		}
 		case VK_FORMAT_D32_SFLOAT:
 		{
-			fmt::throw_exception("Unsupported transfer (D16_FLOAT");
+			rsx_log.error("Unsupported transfer (D16_FLOAT)"); // Need real games to test this.
+			verify(HERE), region.imageSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT;
+
+			const u32 out_w = region.bufferRowLength ? region.bufferRowLength : region.imageExtent.width;
+			const u32 out_h = region.bufferImageHeight ? region.bufferImageHeight : region.imageExtent.height;
+			const u32 packed32_length = out_w * out_h * 4;
+			const u32 packed16_length = out_w * out_h * 2;
+
+			const auto allocation_end = region.bufferOffset + packed32_length + packed16_length;
+			verify(HERE), dst->size() >= allocation_end;
+
+			const auto data_offset = u32(region.bufferOffset);
+			const auto z32_offset = align<u32>(data_offset + packed16_length, 256);
+
+			// 1. Copy the depth to buffer
+			VkBufferImageCopy region2;
+			region2 = region;
+			region2.bufferOffset = z32_offset;
+			vkCmdCopyImageToBuffer(cmd, src->value, src->current_layout, dst->value, 1, &region2);
+
+			// 2. Pre-compute barrier
+			vk::insert_buffer_memory_barrier(cmd, dst->value, z32_offset, packed32_length,
+				VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
+				VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
+
+			// 3. Do conversion with byteswap [D32->D16F]
+			if (!swap_bytes) [[likely]]
+			{
+				auto job = vk::get_compute_task<vk::cs_fconvert_task<u16, u32>>();
+				job->run(cmd, dst, z32_offset, packed32_length, data_offset);
+			}
+			else
+			{
+				auto job = vk::get_compute_task<vk::cs_fconvert_task<u16, u32, false, true>>();
+				job->run(cmd, dst, z32_offset, packed32_length, data_offset);
+			}
+
+			// 4. Post-compute barrier
+			vk::insert_buffer_memory_barrier(cmd, dst->value, region.bufferOffset, packed16_length,
+				VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
+				VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
 			break;
 		}
 		case VK_FORMAT_D24_UNORM_S8_UINT:
@ -177,7 +217,38 @@ namespace vk
 		}
 		case VK_FORMAT_D32_SFLOAT:
 		{
-			fmt::throw_exception("Unsupported transfer (D16_FLOAT");
+			rsx_log.error("Unsupported transfer (D16_FLOAT)");
+			verify(HERE), region.imageSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT;
+
+			const u32 out_w = region.bufferRowLength ? region.bufferRowLength : region.imageExtent.width;
+			const u32 out_h = region.bufferImageHeight ? region.bufferImageHeight : region.imageExtent.height;
+			const u32 packed32_length = out_w * out_h * 4;
+			const u32 packed16_length = out_w * out_h * 2;
+
+			const auto allocation_end = region.bufferOffset + packed32_length + packed16_length;
+			verify(HERE), src->size() >= allocation_end;
+
+			const auto data_offset = u32(region.bufferOffset);
+			const auto z32_offset = align<u32>(data_offset + packed16_length, 256);
+
+			// 1. Pre-compute barrier
+			vk::insert_buffer_memory_barrier(cmd, src->value, z32_offset, packed32_length,
+				VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
+				VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
+
+			// 2. Do conversion with byteswap [D16F->D32F]
+			auto job = vk::get_compute_task<vk::cs_fconvert_task<u32, u16>>();
+			job->run(cmd, src, data_offset, packed16_length, z32_offset);
+
+			// 4. Post-compute barrier
+			vk::insert_buffer_memory_barrier(cmd, src->value, z32_offset, packed32_length,
+				VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
+				VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
+
+			// 5. Copy the depth data to image
+			VkBufferImageCopy region2 = region;
+			region2.bufferOffset = z32_offset;
+			vkCmdCopyBufferToImage(cmd, src->value, dst->value, dst->current_layout, 1, &region2);
 			break;
 		}
 		case VK_FORMAT_D24_UNORM_S8_UINT:
@ -770,6 +841,7 @@ namespace vk
 		const std::vector<rsx::subresource_layout>& subresource_layout, int format, bool is_swizzled, u16 mipmap_count,
 		VkImageAspectFlags flags, vk::data_heap &upload_heap, u32 heap_align)
 	{
+		const bool requires_depth_processing = (dst_image->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT) || (format == CELL_GCM_TEXTURE_DEPTH16_FLOAT);
 		u32 block_in_pixel = rsx::get_format_block_size_in_texel(format);
 		u8  block_size_in_bytes = rsx::get_format_block_size_in_bytes(format);

@ -842,7 +914,7 @@ namespace vk
 			copy_info.imageSubresource.mipLevel = layout.level;
 			copy_info.bufferRowLength = std::max<u32>(block_in_pixel * row_pitch / block_size_in_bytes, layout.width_in_texel);

-			if (opt.require_swap || opt.require_deswizzle || dst_image->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT)
+			if (opt.require_swap || opt.require_deswizzle || requires_depth_processing)
 			{
 				if (!scratch_buf)
 				{
@ -871,7 +943,7 @@ namespace vk
 			}
 		}

-		if (opt.require_swap || opt.require_deswizzle || dst_image->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT)
+		if (opt.require_swap || opt.require_deswizzle || requires_depth_processing)
 		{
 			verify(HERE), scratch_buf;
 			vkCmdCopyBuffer(cmd, upload_heap.heap->value, scratch_buf->value, static_cast<u32>(buffer_copies.size()), buffer_copies.data());
@ -902,7 +974,7 @@ namespace vk
 		}

 		// CopyBufferToImage routines
-		if (dst_image->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT)
+		if (requires_depth_processing)
 		{
 			// Upload in reverse to avoid polluting data in lower space
 			for (auto rIt = copy_regions.crbegin(); rIt != copy_regions.crend(); ++rIt)