mirror of
https://github.com/RPCS3/rpcs3.git
synced 2024-11-22 10:42:36 +01:00
vk: Improve D16F handling
- Adds D16F upload and download routines. They are mostly untested, which is why an error message is still logged when they run.
This commit is contained in:
parent
9a51f22265
commit
af9e217fa4
@ -286,13 +286,14 @@ namespace vk
|
|||||||
"#define d24x8_to_d24x8_swapped(bits) (bits & 0xFF00) | (bits & 0xFF0000) >> 16 | (bits & 0xFF) << 16\n"
|
"#define d24x8_to_d24x8_swapped(bits) (bits & 0xFF00) | (bits & 0xFF0000) >> 16 | (bits & 0xFF) << 16\n"
|
||||||
"#define f32_to_d24x8_swapped(bits) d24x8_to_d24x8_swapped(f32_to_d24(bits))\n"
|
"#define f32_to_d24x8_swapped(bits) d24x8_to_d24x8_swapped(f32_to_d24(bits))\n"
|
||||||
"\n"
|
"\n"
|
||||||
|
"%md"
|
||||||
"void main()\n"
|
"void main()\n"
|
||||||
"{\n"
|
"{\n"
|
||||||
" uint invocations_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);"
|
" uint invocations_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);"
|
||||||
" uint invocation_id = (gl_GlobalInvocationID.y * invocations_x) + gl_GlobalInvocationID.x;\n"
|
" uint invocation_id = (gl_GlobalInvocationID.y * invocations_x) + gl_GlobalInvocationID.x;\n"
|
||||||
" uint index = invocation_id * KERNEL_SIZE;\n"
|
" uint index = invocation_id * KERNEL_SIZE;\n"
|
||||||
" uint value;\n"
|
" uint value;\n"
|
||||||
" %vars"
|
"%vars"
|
||||||
"\n";
|
"\n";
|
||||||
|
|
||||||
const auto parameters_size = align(push_constants_size, 16) / 16;
|
const auto parameters_size = align(push_constants_size, 16) / 16;
|
||||||
@ -302,6 +303,7 @@ namespace vk
|
|||||||
{ "%ks", std::to_string(kernel_size) },
|
{ "%ks", std::to_string(kernel_size) },
|
||||||
{ "%vars", variables },
|
{ "%vars", variables },
|
||||||
{ "%f", function_name },
|
{ "%f", function_name },
|
||||||
|
{ "%md", method_declarations },
|
||||||
{ "%ub", use_push_constants? "layout(push_constant) uniform ubo{ uvec4 params[" + std::to_string(parameters_size) + "]; };\n" : "" },
|
{ "%ub", use_push_constants? "layout(push_constant) uniform ubo{ uvec4 params[" + std::to_string(parameters_size) + "]; };\n" : "" },
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -458,6 +460,7 @@ namespace vk
|
|||||||
u32 parameters[4] = { data_length, zeta_offset - data_offset, stencil_offset - data_offset, 0 };
|
u32 parameters[4] = { data_length, zeta_offset - data_offset, stencil_offset - data_offset, 0 };
|
||||||
set_parameters(cmd, parameters, 4);
|
set_parameters(cmd, parameters, 4);
|
||||||
|
|
||||||
|
verify(HERE), stencil_offset > data_offset;
|
||||||
m_ssbo_length = stencil_offset + (data_length / 4) - data_offset;
|
m_ssbo_length = stencil_offset + (data_length / 4) - data_offset;
|
||||||
cs_shuffle_base::run(cmd, data, data_length, data_offset);
|
cs_shuffle_base::run(cmd, data, data_length, data_offset);
|
||||||
}
|
}
|
||||||
@ -588,6 +591,132 @@ namespace vk
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template<typename To, typename From, bool _SwapSrc = false, bool _SwapDst = false>
|
||||||
|
struct cs_fconvert_task : cs_shuffle_base
|
||||||
|
{
|
||||||
|
u32 m_ssbo_length = 0;
|
||||||
|
|
||||||
|
void declare_f16_expansion()
|
||||||
|
{
|
||||||
|
method_declarations +=
|
||||||
|
"uvec2 unpack_e4m12_pack16(const in uint value)\n"
|
||||||
|
"{\n"
|
||||||
|
" uvec2 result = uvec2(bitfieldExtract(value, 0, 16), bitfieldExtract(value, 16, 16));\n"
|
||||||
|
" result <<= 11;\n"
|
||||||
|
" result += (120 << 23);\n"
|
||||||
|
" return result;\n"
|
||||||
|
"}\n\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
void declare_f16_contraction()
|
||||||
|
{
|
||||||
|
method_declarations +=
|
||||||
|
"uint pack_e4m12_pack16(const in uvec2 value)\n"
|
||||||
|
"{\n"
|
||||||
|
" uvec2 result = (value - (120 << 23)) >> 11;\n"
|
||||||
|
" return (result.x & 0xFFFF) | (result.y << 16);\n"
|
||||||
|
"}\n\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
cs_fconvert_task()
|
||||||
|
{
|
||||||
|
use_push_constants = true;
|
||||||
|
push_constants_size = 16;
|
||||||
|
|
||||||
|
variables =
|
||||||
|
" uint block_length = params[0].x >> 2;\n"
|
||||||
|
" uint in_offset = params[0].y >> 2;\n"
|
||||||
|
" uint out_offset = params[0].z >> 2;\n"
|
||||||
|
" uvec4 tmp;\n";
|
||||||
|
|
||||||
|
work_kernel +=
|
||||||
|
" if (index >= block_length)\n"
|
||||||
|
" return;\n";
|
||||||
|
|
||||||
|
if constexpr (sizeof(From) == 4)
|
||||||
|
{
|
||||||
|
static_assert(sizeof(To) == 2);
|
||||||
|
declare_f16_contraction();
|
||||||
|
|
||||||
|
work_kernel +=
|
||||||
|
" const uint src_offset = (index * 2) + in_offset;\n"
|
||||||
|
" const uint dst_offset = index + out_offset;\n"
|
||||||
|
" tmp.x = data[src_offset];\n"
|
||||||
|
" tmp.y = data[src_offset + 1];\n";
|
||||||
|
|
||||||
|
if constexpr (_SwapSrc)
|
||||||
|
{
|
||||||
|
work_kernel +=
|
||||||
|
" tmp = bswap_u32(tmp);\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convert
|
||||||
|
work_kernel += " tmp.z = pack_e4m12_pack16(tmp);\n";
|
||||||
|
|
||||||
|
if constexpr (_SwapDst)
|
||||||
|
{
|
||||||
|
work_kernel += " tmp.z = bswap_u16(tmp.z);\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
work_kernel += " data[dst_offset] = tmp.z;\n";
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
static_assert(sizeof(To) == 4);
|
||||||
|
declare_f16_expansion();
|
||||||
|
|
||||||
|
work_kernel +=
|
||||||
|
" const uint src_offset = index + in_offset;\n"
|
||||||
|
" const uint dst_offset = (index * 2) + out_offset;\n"
|
||||||
|
" tmp.x = data[src_offset];\n";
|
||||||
|
|
||||||
|
if constexpr (_SwapSrc)
|
||||||
|
{
|
||||||
|
work_kernel +=
|
||||||
|
" tmp.x = bswap_u16(tmp.x);\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convert
|
||||||
|
work_kernel += " tmp.yz = unpack_e4m12_pack16(tmp.x);\n";
|
||||||
|
|
||||||
|
if constexpr (_SwapDst)
|
||||||
|
{
|
||||||
|
work_kernel += " tmp.yz = bswap_u16(tmp.yz);\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
work_kernel +=
|
||||||
|
" data[dst_offset] = tmp.y;\n"
|
||||||
|
" data[dst_offset + 1] = tmp.z;\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
cs_shuffle_base::build("");
|
||||||
|
}
|
||||||
|
|
||||||
|
void bind_resources() override
|
||||||
|
{
|
||||||
|
m_program->bind_buffer({ m_data->value, m_data_offset, m_ssbo_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
|
||||||
|
}
|
||||||
|
|
||||||
|
void run(VkCommandBuffer cmd, const vk::buffer* data, u32 src_offset, u32 src_length, u32 dst_offset)
|
||||||
|
{
|
||||||
|
u32 data_offset;
|
||||||
|
if (src_offset > dst_offset)
|
||||||
|
{
|
||||||
|
m_ssbo_length = (src_offset + src_length) - dst_offset;
|
||||||
|
data_offset = dst_offset;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
m_ssbo_length = (dst_offset - src_offset) + (src_length / sizeof(From)) * sizeof(To);
|
||||||
|
data_offset = src_offset;
|
||||||
|
}
|
||||||
|
|
||||||
|
u32 parameters[4] = { src_length, src_offset - data_offset, dst_offset - data_offset, 0 };
|
||||||
|
set_parameters(cmd, parameters, 4);
|
||||||
|
cs_shuffle_base::run(cmd, data, src_length, data_offset);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
// Reverse morton-order block arrangement
|
// Reverse morton-order block arrangement
|
||||||
struct cs_deswizzle_base : compute_task
|
struct cs_deswizzle_base : compute_task
|
||||||
{
|
{
|
||||||
|
@ -335,6 +335,7 @@ namespace vk
|
|||||||
case VK_FORMAT_R32G32B32A32_SFLOAT:
|
case VK_FORMAT_R32G32B32A32_SFLOAT:
|
||||||
return 16;
|
return 16;
|
||||||
case VK_FORMAT_D16_UNORM:
|
case VK_FORMAT_D16_UNORM:
|
||||||
|
case VK_FORMAT_D32_SFLOAT:
|
||||||
return 2;
|
return 2;
|
||||||
case VK_FORMAT_D32_SFLOAT_S8_UINT: //TODO: Translate to D24S8
|
case VK_FORMAT_D32_SFLOAT_S8_UINT: //TODO: Translate to D24S8
|
||||||
case VK_FORMAT_D24_UNORM_S8_UINT:
|
case VK_FORMAT_D24_UNORM_S8_UINT:
|
||||||
@ -396,6 +397,7 @@ namespace vk
|
|||||||
return{ 4, 1 };
|
return{ 4, 1 };
|
||||||
//Depth
|
//Depth
|
||||||
case VK_FORMAT_D16_UNORM:
|
case VK_FORMAT_D16_UNORM:
|
||||||
|
case VK_FORMAT_D32_SFLOAT:
|
||||||
return{ 2, 1 };
|
return{ 2, 1 };
|
||||||
case VK_FORMAT_D32_SFLOAT_S8_UINT:
|
case VK_FORMAT_D32_SFLOAT_S8_UINT:
|
||||||
case VK_FORMAT_D24_UNORM_S8_UINT:
|
case VK_FORMAT_D24_UNORM_S8_UINT:
|
||||||
|
@ -80,7 +80,47 @@ namespace vk
|
|||||||
}
|
}
|
||||||
case VK_FORMAT_D32_SFLOAT:
|
case VK_FORMAT_D32_SFLOAT:
|
||||||
{
|
{
|
||||||
fmt::throw_exception("Unsupported transfer (D16_FLOAT");
|
rsx_log.error("Unsupported transfer (D16_FLOAT)"); // Need real games to test this.
|
||||||
|
verify(HERE), region.imageSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT;
|
||||||
|
|
||||||
|
const u32 out_w = region.bufferRowLength ? region.bufferRowLength : region.imageExtent.width;
|
||||||
|
const u32 out_h = region.bufferImageHeight ? region.bufferImageHeight : region.imageExtent.height;
|
||||||
|
const u32 packed32_length = out_w * out_h * 4;
|
||||||
|
const u32 packed16_length = out_w * out_h * 2;
|
||||||
|
|
||||||
|
const auto allocation_end = region.bufferOffset + packed32_length + packed16_length;
|
||||||
|
verify(HERE), dst->size() >= allocation_end;
|
||||||
|
|
||||||
|
const auto data_offset = u32(region.bufferOffset);
|
||||||
|
const auto z32_offset = align<u32>(data_offset + packed16_length, 256);
|
||||||
|
|
||||||
|
// 1. Copy the depth to buffer
|
||||||
|
VkBufferImageCopy region2;
|
||||||
|
region2 = region;
|
||||||
|
region2.bufferOffset = z32_offset;
|
||||||
|
vkCmdCopyImageToBuffer(cmd, src->value, src->current_layout, dst->value, 1, ®ion2);
|
||||||
|
|
||||||
|
// 2. Pre-compute barrier
|
||||||
|
vk::insert_buffer_memory_barrier(cmd, dst->value, z32_offset, packed32_length,
|
||||||
|
VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
|
||||||
|
VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
|
||||||
|
|
||||||
|
// 3. Do conversion with byteswap [D32->D16F]
|
||||||
|
if (!swap_bytes) [[likely]]
|
||||||
|
{
|
||||||
|
auto job = vk::get_compute_task<vk::cs_fconvert_task<u16, u32>>();
|
||||||
|
job->run(cmd, dst, z32_offset, packed32_length, data_offset);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
auto job = vk::get_compute_task<vk::cs_fconvert_task<u16, u32, false, true>>();
|
||||||
|
job->run(cmd, dst, z32_offset, packed32_length, data_offset);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 4. Post-compute barrier
|
||||||
|
vk::insert_buffer_memory_barrier(cmd, dst->value, region.bufferOffset, packed16_length,
|
||||||
|
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
|
||||||
|
VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case VK_FORMAT_D24_UNORM_S8_UINT:
|
case VK_FORMAT_D24_UNORM_S8_UINT:
|
||||||
@ -177,7 +217,38 @@ namespace vk
|
|||||||
}
|
}
|
||||||
case VK_FORMAT_D32_SFLOAT:
|
case VK_FORMAT_D32_SFLOAT:
|
||||||
{
|
{
|
||||||
fmt::throw_exception("Unsupported transfer (D16_FLOAT");
|
rsx_log.error("Unsupported transfer (D16_FLOAT)");
|
||||||
|
verify(HERE), region.imageSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT;
|
||||||
|
|
||||||
|
const u32 out_w = region.bufferRowLength ? region.bufferRowLength : region.imageExtent.width;
|
||||||
|
const u32 out_h = region.bufferImageHeight ? region.bufferImageHeight : region.imageExtent.height;
|
||||||
|
const u32 packed32_length = out_w * out_h * 4;
|
||||||
|
const u32 packed16_length = out_w * out_h * 2;
|
||||||
|
|
||||||
|
const auto allocation_end = region.bufferOffset + packed32_length + packed16_length;
|
||||||
|
verify(HERE), src->size() >= allocation_end;
|
||||||
|
|
||||||
|
const auto data_offset = u32(region.bufferOffset);
|
||||||
|
const auto z32_offset = align<u32>(data_offset + packed16_length, 256);
|
||||||
|
|
||||||
|
// 1. Pre-compute barrier
|
||||||
|
vk::insert_buffer_memory_barrier(cmd, src->value, z32_offset, packed32_length,
|
||||||
|
VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
|
||||||
|
VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
|
||||||
|
|
||||||
|
// 2. Do conversion with byteswap [D16F->D32F]
|
||||||
|
auto job = vk::get_compute_task<vk::cs_fconvert_task<u32, u16>>();
|
||||||
|
job->run(cmd, src, data_offset, packed16_length, z32_offset);
|
||||||
|
|
||||||
|
// 4. Post-compute barrier
|
||||||
|
vk::insert_buffer_memory_barrier(cmd, src->value, z32_offset, packed32_length,
|
||||||
|
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
|
||||||
|
VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
|
||||||
|
|
||||||
|
// 5. Copy the depth data to image
|
||||||
|
VkBufferImageCopy region2 = region;
|
||||||
|
region2.bufferOffset = z32_offset;
|
||||||
|
vkCmdCopyBufferToImage(cmd, src->value, dst->value, dst->current_layout, 1, ®ion2);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case VK_FORMAT_D24_UNORM_S8_UINT:
|
case VK_FORMAT_D24_UNORM_S8_UINT:
|
||||||
@ -770,6 +841,7 @@ namespace vk
|
|||||||
const std::vector<rsx::subresource_layout>& subresource_layout, int format, bool is_swizzled, u16 mipmap_count,
|
const std::vector<rsx::subresource_layout>& subresource_layout, int format, bool is_swizzled, u16 mipmap_count,
|
||||||
VkImageAspectFlags flags, vk::data_heap &upload_heap, u32 heap_align)
|
VkImageAspectFlags flags, vk::data_heap &upload_heap, u32 heap_align)
|
||||||
{
|
{
|
||||||
|
const bool requires_depth_processing = (dst_image->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT) || (format == CELL_GCM_TEXTURE_DEPTH16_FLOAT);
|
||||||
u32 block_in_pixel = rsx::get_format_block_size_in_texel(format);
|
u32 block_in_pixel = rsx::get_format_block_size_in_texel(format);
|
||||||
u8 block_size_in_bytes = rsx::get_format_block_size_in_bytes(format);
|
u8 block_size_in_bytes = rsx::get_format_block_size_in_bytes(format);
|
||||||
|
|
||||||
@ -842,7 +914,7 @@ namespace vk
|
|||||||
copy_info.imageSubresource.mipLevel = layout.level;
|
copy_info.imageSubresource.mipLevel = layout.level;
|
||||||
copy_info.bufferRowLength = std::max<u32>(block_in_pixel * row_pitch / block_size_in_bytes, layout.width_in_texel);
|
copy_info.bufferRowLength = std::max<u32>(block_in_pixel * row_pitch / block_size_in_bytes, layout.width_in_texel);
|
||||||
|
|
||||||
if (opt.require_swap || opt.require_deswizzle || dst_image->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT)
|
if (opt.require_swap || opt.require_deswizzle || requires_depth_processing)
|
||||||
{
|
{
|
||||||
if (!scratch_buf)
|
if (!scratch_buf)
|
||||||
{
|
{
|
||||||
@ -871,7 +943,7 @@ namespace vk
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (opt.require_swap || opt.require_deswizzle || dst_image->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT)
|
if (opt.require_swap || opt.require_deswizzle || requires_depth_processing)
|
||||||
{
|
{
|
||||||
verify(HERE), scratch_buf;
|
verify(HERE), scratch_buf;
|
||||||
vkCmdCopyBuffer(cmd, upload_heap.heap->value, scratch_buf->value, static_cast<u32>(buffer_copies.size()), buffer_copies.data());
|
vkCmdCopyBuffer(cmd, upload_heap.heap->value, scratch_buf->value, static_cast<u32>(buffer_copies.size()), buffer_copies.data());
|
||||||
@ -902,7 +974,7 @@ namespace vk
|
|||||||
}
|
}
|
||||||
|
|
||||||
// CopyBufferToImage routines
|
// CopyBufferToImage routines
|
||||||
if (dst_image->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT)
|
if (requires_depth_processing)
|
||||||
{
|
{
|
||||||
// Upload in reverse to avoid polluting data in lower space
|
// Upload in reverse to avoid polluting data in lower space
|
||||||
for (auto rIt = copy_regions.crbegin(); rIt != copy_regions.crend(); ++rIt)
|
for (auto rIt = copy_regions.crbegin(); rIt != copy_regions.crend(); ++rIt)
|
||||||
|
Loading…
Reference in New Issue
Block a user