
vk: Improve D16F handling

- Adds upload and download routines. Mostly untested, which is why the error message exists.
kd-11, 2020-08-29 17:06:12 +03:00 (committed by kd-11)
parent 9a51f22265
commit af9e217fa4
3 changed files with 209 additions and 6 deletions
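
For context before the diffs: DEPTH16_FLOAT ("D16F") is handled here as an unsigned 16-bit float, and the new compute kernels convert it to and from the 32-bit D32_SFLOAT data the Vulkan image actually stores. Below is a minimal standalone C++ sketch of the same bit math the shaders use; the layout (no sign bit, 4 exponent bits biased by 7, 12 mantissa bits, "e4m12") is inferred from the shift-by-11 and the +(120 << 23) re-bias in the kernels, and the function names are illustrative only:

#include <bit>
#include <cstdint>
#include <cstdio>

static float d16f_to_f32(std::uint16_t bits)
{
    // The 4-bit exponent lands in the low bits of the f32 exponent field,
    // the 12-bit mantissa in the top bits of the f32 mantissa field;
    // adding 120 re-biases the exponent from 7 to IEEE-754's 127.
    return std::bit_cast<float>((std::uint32_t(bits) << 11) + (120u << 23));
}

static std::uint16_t f32_to_d16f(float value)
{
    return std::uint16_t((std::bit_cast<std::uint32_t>(value) - (120u << 23)) >> 11);
}

int main()
{
    std::printf("%f\n", d16f_to_f32(0x7000)); // exponent 7 (unbiased 0) -> 1.000000
    std::printf("%#x\n", f32_to_d16f(0.5f));  // 0x6000: exponent 6 (unbiased -1)
}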


@@ -286,13 +286,14 @@ namespace vk
"#define d24x8_to_d24x8_swapped(bits) (bits & 0xFF00) | (bits & 0xFF0000) >> 16 | (bits & 0xFF) << 16\n"
"#define f32_to_d24x8_swapped(bits) d24x8_to_d24x8_swapped(f32_to_d24(bits))\n"
"\n"
"%md"
"void main()\n"
"{\n"
" uint invocations_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);"
" uint invocation_id = (gl_GlobalInvocationID.y * invocations_x) + gl_GlobalInvocationID.x;\n"
" uint index = invocation_id * KERNEL_SIZE;\n"
" uint value;\n"
" %vars"
"%vars"
"\n";
const auto parameters_size = align(push_constants_size, 16) / 16;
@@ -302,6 +303,7 @@ namespace vk
{ "%ks", std::to_string(kernel_size) },
{ "%vars", variables },
{ "%f", function_name },
{ "%md", method_declarations },
{ "%ub", use_push_constants? "layout(push_constant) uniform ubo{ uvec4 params[" + std::to_string(parameters_size) + "]; };\n" : "" },
};
@@ -458,6 +460,7 @@ namespace vk
u32 parameters[4] = { data_length, zeta_offset - data_offset, stencil_offset - data_offset, 0 };
set_parameters(cmd, parameters, 4);
verify(HERE), stencil_offset > data_offset;
m_ssbo_length = stencil_offset + (data_length / 4) - data_offset;
cs_shuffle_base::run(cmd, data, data_length, data_offset);
}
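
A worked example of the new m_ssbo_length bound, with hypothetical numbers and assuming data_length is the packed depth size at 4 bytes per pixel (so the stencil block is a quarter of it, 1 byte per pixel): for a 64x64 surface with data_offset = 0 and stencil_offset = 16384, m_ssbo_length = 16384 + 16384 / 4 - 0 = 20480 bytes, i.e. the SSBO binding now reaches the end of the stencil block instead of stopping at the end of the depth data.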
@@ -588,6 +591,132 @@ namespace vk
}
};
template<typename To, typename From, bool _SwapSrc = false, bool _SwapDst = false>
struct cs_fconvert_task : cs_shuffle_base
{
u32 m_ssbo_length = 0;
void declare_f16_expansion()
{
method_declarations +=
"uvec2 unpack_e4m12_pack16(const in uint value)\n"
"{\n"
" uvec2 result = uvec2(bitfieldExtract(value, 0, 16), bitfieldExtract(value, 16, 16));\n"
" result <<= 11;\n"
" result += (120 << 23);\n"
" return result;\n"
"}\n\n";
}
void declare_f16_contraction()
{
method_declarations +=
"uint pack_e4m12_pack16(const in uvec2 value)\n"
"{\n"
" uvec2 result = (value - (120 << 23)) >> 11;\n"
" return (result.x & 0xFFFF) | (result.y << 16);\n"
"}\n\n";
}
cs_fconvert_task()
{
use_push_constants = true;
push_constants_size = 16;
variables =
" uint block_length = params[0].x >> 2;\n"
" uint in_offset = params[0].y >> 2;\n"
" uint out_offset = params[0].z >> 2;\n"
" uvec4 tmp;\n";
work_kernel +=
" if (index >= block_length)\n"
" return;\n";
if constexpr (sizeof(From) == 4)
{
static_assert(sizeof(To) == 2);
declare_f16_contraction();
work_kernel +=
" const uint src_offset = (index * 2) + in_offset;\n"
" const uint dst_offset = index + out_offset;\n"
" tmp.x = data[src_offset];\n"
" tmp.y = data[src_offset + 1];\n";
if constexpr (_SwapSrc)
{
work_kernel +=
" tmp = bswap_u32(tmp);\n";
}
// Convert
work_kernel += " tmp.z = pack_e4m12_pack16(tmp);\n";
if constexpr (_SwapDst)
{
work_kernel += " tmp.z = bswap_u16(tmp.z);\n";
}
work_kernel += " data[dst_offset] = tmp.z;\n";
}
else
{
static_assert(sizeof(To) == 4);
declare_f16_expansion();
work_kernel +=
" const uint src_offset = index + in_offset;\n"
" const uint dst_offset = (index * 2) + out_offset;\n"
" tmp.x = data[src_offset];\n";
if constexpr (_SwapSrc)
{
work_kernel +=
" tmp.x = bswap_u16(tmp.x);\n";
}
// Convert
work_kernel += " tmp.yz = unpack_e4m12_pack16(tmp.x);\n";
if constexpr (_SwapDst)
{
work_kernel += " tmp.yz = bswap_u16(tmp.yz);\n";
}
work_kernel +=
" data[dst_offset] = tmp.y;\n"
" data[dst_offset + 1] = tmp.z;\n";
}
cs_shuffle_base::build("");
}
void bind_resources() override
{
m_program->bind_buffer({ m_data->value, m_data_offset, m_ssbo_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
}
void run(VkCommandBuffer cmd, const vk::buffer* data, u32 src_offset, u32 src_length, u32 dst_offset)
{
u32 data_offset;
if (src_offset > dst_offset)
{
m_ssbo_length = (src_offset + src_length) - dst_offset;
data_offset = dst_offset;
}
else
{
m_ssbo_length = (dst_offset - src_offset) + (src_length / sizeof(From)) * sizeof(To);
data_offset = src_offset;
}
u32 parameters[4] = { src_length, src_offset - data_offset, dst_offset - data_offset, 0 };
set_parameters(cmd, parameters, 4);
cs_shuffle_base::run(cmd, data, src_length, data_offset);
}
};
// Reverse morton-order block arrangement
struct cs_deswizzle_base : compute_task
{
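
How the new cs_fconvert_task template is meant to be instantiated (a usage sketch; the buffer and offsets are hypothetical, and get_compute_task is the same cached-task accessor the transfer code below uses):

// <To = u16, From = u32> packs D32F down to D16F (download path);
// <To = u32, From = u16> expands D16F up to D32F (upload path);
// _SwapSrc/_SwapDst add a bswap_u16/bswap_u32 on the input/output side.
auto pack_job = vk::get_compute_task<vk::cs_fconvert_task<u16, u32>>();

// run(cmd, buffer, src_offset, src_length_in_bytes, dst_offset):
// pack 0x100000 bytes of D32F at offset 0x40000 into D16F at offset 0.
pack_job->run(cmd, scratch_buf, 0x40000, 0x100000, 0);

Note that run() sets m_ssbo_length to span the union of the source and destination ranges, so bind_resources() can expose both through a single SSBO binding.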


@@ -335,6 +335,7 @@ namespace vk
case VK_FORMAT_R32G32B32A32_SFLOAT:
return 16;
case VK_FORMAT_D16_UNORM:
case VK_FORMAT_D32_SFLOAT:
return 2;
case VK_FORMAT_D32_SFLOAT_S8_UINT: //TODO: Translate to D24S8
case VK_FORMAT_D24_UNORM_S8_UINT:
@@ -396,6 +397,7 @@ namespace vk
return{ 4, 1 };
//Depth
case VK_FORMAT_D16_UNORM:
case VK_FORMAT_D32_SFLOAT:
return{ 2, 1 };
case VK_FORMAT_D32_SFLOAT_S8_UINT:
case VK_FORMAT_D24_UNORM_S8_UINT:


@@ -80,7 +80,47 @@ namespace vk
}
case VK_FORMAT_D32_SFLOAT:
{
- fmt::throw_exception("Unsupported transfer (D16_FLOAT");
+ rsx_log.error("Unsupported transfer (D16_FLOAT)"); // Need real games to test this.
verify(HERE), region.imageSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT;
const u32 out_w = region.bufferRowLength ? region.bufferRowLength : region.imageExtent.width;
const u32 out_h = region.bufferImageHeight ? region.bufferImageHeight : region.imageExtent.height;
const u32 packed32_length = out_w * out_h * 4;
const u32 packed16_length = out_w * out_h * 2;
const auto allocation_end = region.bufferOffset + packed32_length + packed16_length;
verify(HERE), dst->size() >= allocation_end;
const auto data_offset = u32(region.bufferOffset);
const auto z32_offset = align<u32>(data_offset + packed16_length, 256);
// 1. Copy the depth to buffer
VkBufferImageCopy region2 = region;
region2.bufferOffset = z32_offset;
vkCmdCopyImageToBuffer(cmd, src->value, src->current_layout, dst->value, 1, &region2);
// 2. Pre-compute barrier
vk::insert_buffer_memory_barrier(cmd, dst->value, z32_offset, packed32_length,
VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
// 3. Convert [D32->D16F], with byteswap if requested
if (!swap_bytes) [[likely]]
{
auto job = vk::get_compute_task<vk::cs_fconvert_task<u16, u32>>();
job->run(cmd, dst, z32_offset, packed32_length, data_offset);
}
else
{
auto job = vk::get_compute_task<vk::cs_fconvert_task<u16, u32, false, true>>();
job->run(cmd, dst, z32_offset, packed32_length, data_offset);
}
// 4. Post-compute barrier
vk::insert_buffer_memory_barrier(cmd, dst->value, region.bufferOffset, packed16_length,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
break;
}
case VK_FORMAT_D24_UNORM_S8_UINT:
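
The scratch-buffer layout used by the D32_SFLOAT download case above, sketched with a hypothetical 256x256 region at bufferOffset = 0:

// packed16_length = 256 * 256 * 2 = 131072  -> [0, 131072) final D16F output
// z32_offset      = align(131072, 256) = 131072 (already aligned)
// packed32_length = 256 * 256 * 4 = 262144  -> [131072, 393216) raw D32F staging
// The image is first copied as D32F into the upper region, then the compute
// job packs it down into the lower region the caller reads the D16F data from.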
@@ -177,7 +217,38 @@ namespace vk
}
case VK_FORMAT_D32_SFLOAT:
{
- fmt::throw_exception("Unsupported transfer (D16_FLOAT");
+ rsx_log.error("Unsupported transfer (D16_FLOAT)");
verify(HERE), region.imageSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT;
const u32 out_w = region.bufferRowLength ? region.bufferRowLength : region.imageExtent.width;
const u32 out_h = region.bufferImageHeight ? region.bufferImageHeight : region.imageExtent.height;
const u32 packed32_length = out_w * out_h * 4;
const u32 packed16_length = out_w * out_h * 2;
const auto allocation_end = region.bufferOffset + packed32_length + packed16_length;
verify(HERE), src->size() >= allocation_end;
const auto data_offset = u32(region.bufferOffset);
const auto z32_offset = align<u32>(data_offset + packed16_length, 256);
// 1. Pre-compute barrier
vk::insert_buffer_memory_barrier(cmd, src->value, z32_offset, packed32_length,
VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
// 2. Convert [D16F->D32F]
auto job = vk::get_compute_task<vk::cs_fconvert_task<u32, u16>>();
job->run(cmd, src, data_offset, packed16_length, z32_offset);
// 3. Post-compute barrier
vk::insert_buffer_memory_barrier(cmd, src->value, z32_offset, packed32_length,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
// 4. Copy the depth data to image
VkBufferImageCopy region2 = region;
region2.bufferOffset = z32_offset;
vkCmdCopyBufferToImage(cmd, src->value, dst->value, dst->current_layout, 1, &region2);
break;
}
case VK_FORMAT_D24_UNORM_S8_UINT:
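
For reference, the vk::insert_buffer_memory_barrier helper used throughout these paths is RPCS3's thin wrapper over vkCmdPipelineBarrier. A minimal sketch of such a helper in plain Vulkan (the actual body in the codebase may differ):

void insert_buffer_memory_barrier(VkCommandBuffer cmd, VkBuffer buffer, VkDeviceSize offset, VkDeviceSize size,
    VkPipelineStageFlags src_stage, VkPipelineStageFlags dst_stage,
    VkAccessFlags src_access, VkAccessFlags dst_access)
{
    VkBufferMemoryBarrier barrier = {};
    barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
    barrier.srcAccessMask = src_access;
    barrier.dstAccessMask = dst_access;
    barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
    barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
    barrier.buffer = buffer;
    barrier.offset = offset;
    barrier.size = size;

    // Make writes from src_stage visible to reads in dst_stage for this range.
    vkCmdPipelineBarrier(cmd, src_stage, dst_stage, 0, 0, nullptr, 1, &barrier, 0, nullptr);
}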
@@ -770,6 +841,7 @@ namespace vk
const std::vector<rsx::subresource_layout>& subresource_layout, int format, bool is_swizzled, u16 mipmap_count,
VkImageAspectFlags flags, vk::data_heap &upload_heap, u32 heap_align)
{
const bool requires_depth_processing = (dst_image->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT) || (format == CELL_GCM_TEXTURE_DEPTH16_FLOAT);
u32 block_in_pixel = rsx::get_format_block_size_in_texel(format);
u8 block_size_in_bytes = rsx::get_format_block_size_in_bytes(format);
@@ -842,7 +914,7 @@ namespace vk
copy_info.imageSubresource.mipLevel = layout.level;
copy_info.bufferRowLength = std::max<u32>(block_in_pixel * row_pitch / block_size_in_bytes, layout.width_in_texel);
- if (opt.require_swap || opt.require_deswizzle || dst_image->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT)
+ if (opt.require_swap || opt.require_deswizzle || requires_depth_processing)
{
if (!scratch_buf)
{
@@ -871,7 +943,7 @@ namespace vk
}
}
- if (opt.require_swap || opt.require_deswizzle || dst_image->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT)
+ if (opt.require_swap || opt.require_deswizzle || requires_depth_processing)
{
verify(HERE), scratch_buf;
vkCmdCopyBuffer(cmd, upload_heap.heap->value, scratch_buf->value, static_cast<u32>(buffer_copies.size()), buffer_copies.data());
@@ -902,7 +974,7 @@ namespace vk
}
// CopyBufferToImage routines
- if (dst_image->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT)
+ if (requires_depth_processing)
{
// Upload in reverse to avoid polluting data in lower space
for (auto rIt = copy_regions.crbegin(); rIt != copy_regions.crend(); ++rIt)