1
0
mirror of https://github.com/RPCS3/rpcs3.git synced 2024-11-22 10:42:36 +01:00

vk: Enable gpu deswizzling

This commit is contained in:
kd-11 2019-10-29 15:21:53 +03:00 committed by kd-11
parent 9cd3530c98
commit 1266b63135
5 changed files with 159 additions and 42 deletions

View File

@ -668,13 +668,19 @@ texture_memory_info upload_texture_subresource(gsl::span<gsl::byte> dst_buffer,
{
result.require_swap = true;
result.element_size = word_size;
result.block_length = words_per_block;
if (word_size == 2)
{
const bool skip_swizzle = ((word_size * words_per_block) & 3) == 0 && caps.supports_hw_deswizzle;
if (is_swizzled && skip_swizzle) result.require_deswizzle = true;
if (is_swizzled)
{
if (((word_size * words_per_block) & 3) == 0 && caps.supports_hw_deswizzle)
{
result.require_deswizzle = true;
}
}
if (is_swizzled && !skip_swizzle)
if (is_swizzled && !result.require_deswizzle)
copy_unmodified_block_swizzled::copy_mipmap_level(as_span_workaround<u16>(dst_buffer), as_const_span<const u16>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block);
else
copy_unmodified_block::copy_mipmap_level(as_span_workaround<u16>(dst_buffer), as_const_span<const u16>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);

View File

@ -104,6 +104,7 @@ struct rsx_subresource_layout
// Describes the post-processing an uploaded subresource still needs after the CPU-side copy.
// Every member has a default so that a default-constructed instance means "no post-processing required"
// instead of carrying indeterminate values.
struct texture_memory_info
{
	// Size of one element (word) of the texel block; filled from word_size by the uploader. Presumably bytes - confirm against callers.
	int element_size = 0;
	// Number of elements (words) that make up one block; filled from words_per_block by the uploader.
	int block_length = 0;
	// Set when the consumer is expected to byte-swap the data.
	bool require_swap = false;
	// Set when the data was left swizzled and the consumer is expected to deswizzle it.
	bool require_deswizzle = false;
};

View File

@ -2,7 +2,7 @@
#include "VKHelpers.h"
#include "Utilities/StrUtil.h"
#define VK_MAX_COMPUTE_TASKS 1024 // Max number of jobs per frame
#define VK_MAX_COMPUTE_TASKS 32768 // Max number of jobs per frame
namespace vk
{
@ -22,7 +22,9 @@ namespace vk
bool initialized = false;
bool unroll_loops = true;
bool uniform_inputs = false;
bool use_push_constants = false;
u32 ssbo_count = 1;
u32 push_constants_size = 0;
u32 optimal_group_size = 1;
u32 optimal_kernel_size = 1;
@ -77,6 +79,16 @@ namespace vk
layout_info.setLayoutCount = 1;
layout_info.pSetLayouts = &m_descriptor_layout;
VkPushConstantRange push_constants{};
if (use_push_constants)
{
push_constants.size = push_constants_size;
push_constants.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
layout_info.pushConstantRangeCount = 1;
layout_info.pPushConstantRanges = &push_constants;
}
CHECK_RESULT(vkCreatePipelineLayout(*get_current_renderer(), &layout_info, nullptr, &m_pipeline_layout));
}
@ -258,7 +270,7 @@ namespace vk
"\n"
"void main()\n"
"{\n"
" uint index = %idx;\n"
" uint index = gl_GlobalInvocationID.x;\n"
" uint value;\n"
" %vars"
"\n";
@ -550,19 +562,26 @@ namespace vk
};
// Reverse morton-order block arrangement
// Type-erased entry point shared by the deswizzle compute jobs, so a caller can hold
// any concrete variant through a single pointer type.
struct cs_deswizzle_base : compute_task
{
	// Records one deswizzle pass into cmd.
	// dst / out_offset: destination buffer and the offset at which output is bound
	// src / in_offset:  source buffer and the offset at which input is bound
	// width, height, depth: extent of the image section being processed
	virtual void run(
		VkCommandBuffer cmd,
		const vk::buffer* dst, u32 out_offset,
		const vk::buffer* src, u32 in_offset,
		u32 width, u32 height, u32 depth) = 0;
};
template <typename _BlockType, typename _BaseType, bool _SwapBytes>
struct cs_deswizzle_3d : compute_task
struct cs_deswizzle_3d : cs_deswizzle_base
{
union params_t
{
u32 data[4];
u32 data[6];
struct
{
u32 width;
u32 height;
u32 depth;
u32 logw;
u32 logh;
u32 logd;
};
}
params;
@ -578,25 +597,29 @@ namespace vk
verify("Unsupported block type" HERE), (sizeof(_BlockType) & 3) == 0;
ssbo_count = 2;
uniform_inputs = true;
use_push_constants = true;
push_constants_size = 24;
create();
m_src =
"#version 450\n"
"layout(local_size_x = 8, local_size_y = 8, local_size_z = 1)\n\n"
"layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;\n\n"
"layout(set=0, binding=0, std430) buffer ssbo{ uint data_in[]; }\n"
"layout(set=0, binding=1, std430) buffer ssbo{ uint data_out[]; }\n"
"layout(set=0, binding=2, std140) uniform buffer parameters\n"
"layout(set=0, binding=0, std430) buffer ssbo0{ uint data_in[]; };\n"
"layout(set=0, binding=1, std430) buffer ssbo1{ uint data_out[]; };\n"
"layout(push_constant) uniform parameters\n"
"{\n"
" uint image_width;\n"
" uint image_height;\n"
" uint image_depth;\n"
" uint image_logw;\n"
" uint image_logh;\n"
" uint image_logd;\n"
"};\n\n"
"#define bswap_u16(bits) (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8\n"
"#define bswap_u32(bits) (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24\n"
"#define bswap_u16(bits) (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8\n"
"#define bswap_u32(bits) (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24\n"
"uint get_z_index(uint x, uint y, uint z, uint log2w, uint log2h, uint log2d)\n"
"{\n"
@ -629,26 +652,29 @@ namespace vk
" log2d--;\n"
" }\n"
" }\n"
" while(x > 0 || y > 0 || z > 0)\n"
" while(x > 0 || y > 0 || z > 0);\n"
"\n"
" return offset;\n"
"}\n\n"
"void main()\n"
"{\n"
" if (gl_GlobalInvocationID.x >= image_width || gl_GlobalInvocationID.y >= image_height)\n"
" if (any(greaterThanEqual(gl_GlobalInvocationID, uvec3(image_width, image_height, image_depth))))\n"
" return;\n\n"
" uint texel_id = (gl_GlobalInvocationID.y * image_width) + gl_GlobalInvocationID.x"
" uint texel_id = (gl_GlobalInvocationID.z * image_width * image_height) + (gl_GlobalInvocationID.y * image_width) + gl_GlobalInvocationID.x;\n"
" uint word_count = %_wordcount;\n"
" uint dst_id = (index * word_count);\n\n"
" uint dst_id = (texel_id * word_count);\n\n"
" uint src_id = get_z_index(gl_GlobalInvocationID.x, gl_GlobalInvocationID.y, gl_GlobalInvocationID.z, image_logw, image_logh, image_logd);\n"
" src_id *= word_count;\n\n"
" uint src_id = get_z_index(gl_GlobalInvocationID.x, gl_GlobalInvocation.y, 0, image_logw, image_logh, 0);\n"
" for (uint i = 0; i < word_count; ++i)\n"
" {\n"
" data_out[dst_id++] = %f(data_in[src_id++]);\n"
" uint value = data_in[src_id++];\n"
" data_out[dst_id++] = %f(value);\n"
" }\n\n"
"}\n";
std::string transform;
@ -681,24 +707,14 @@ namespace vk
{
m_program->bind_buffer({ src_buffer->value, in_offset, block_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
m_program->bind_buffer({ dst_buffer->value, out_offset, block_length }, 1, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
m_program->bind_buffer({ m_param_buffer->value, 0, 16 }, 2, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, m_descriptor_set);
}
void set_parameters(VkCommandBuffer cmd)
{
verify(HERE), uniform_inputs;
if (!m_param_buffer)
{
auto pdev = vk::get_current_renderer();
m_param_buffer = std::make_unique<vk::buffer>(*pdev, 256, pdev->get_memory_mapping().host_visible_coherent,
VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0);
}
vkCmdUpdateBuffer(cmd, m_param_buffer->value, 0, 16, params.data);
vkCmdPushConstants(cmd, m_pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, 24, params.data);
}
void run(VkCommandBuffer cmd, const vk::buffer* dst, u32 out_offset, const vk::buffer* src, u32 in_offset, u32 width, u32 height, u32 depth)
void run(VkCommandBuffer cmd, const vk::buffer* dst, u32 out_offset, const vk::buffer* src, u32 in_offset, u32 width, u32 height, u32 depth) override
{
dst_buffer = dst;
src_buffer = src;
@ -708,14 +724,16 @@ namespace vk
this->block_length = sizeof(_BlockType) * width * height * depth;
params.width = width;
params.height = height * depth;
params.height = height;
params.depth = depth;
params.logw = rsx::ceil_log2(width);
params.logh = rsx::ceil_log2(height);
set_parameters();
params.logd = rsx::ceil_log2(depth);
set_parameters(cmd);
const u32 invocations_x = align(params.width, 8) / 8;
const u32 invocations_y = align(params.height, 8) / 8;
compute_task::run(cmd, invocations_x, invocations_y, 1);
compute_task::run(cmd, invocations_x, invocations_y, depth);
}
};

View File

@ -3299,6 +3299,7 @@ public:
std::string shader_type = type == ::glsl::program_domain::glsl_vertex_program ? "vertex" :
type == ::glsl::program_domain::glsl_fragment_program ? "fragment" : "compute";
LOG_NOTICE(RSX, "%s", m_source);
fmt::throw_exception("Failed to compile %s shader" HERE, shader_type);
}

View File

@ -538,6 +538,90 @@ namespace vk
change_image_layout(cmd, dst, preferred_dst_format, dstLayout, vk::get_image_subresource_range(0, 0, 1, 1, aspect));
}
void gpu_deswizzle_sections_impl(VkCommandBuffer cmd, vk::buffer* scratch_buf, u32 dst_offset, int word_size, int word_count, bool swap_bytes, std::vector<VkBufferImageCopy>& sections)
{
// NOTE: This has to be done individually for every LOD
vk::cs_deswizzle_base* job = nullptr;
const auto block_size = (word_size * word_count);
verify(HERE), word_size == 4 || word_size == 2;
if (!swap_bytes)
{
if (word_size == 4)
{
switch (block_size)
{
case 4:
job = vk::get_compute_task<cs_deswizzle_3d<u32, u32, false>>();
break;
case 8:
job = vk::get_compute_task<cs_deswizzle_3d<u64, u32, false>>();
break;
case 16:
job = vk::get_compute_task<cs_deswizzle_3d<u128, u32, false>>();
break;
}
}
else
{
switch (block_size)
{
case 4:
job = vk::get_compute_task<cs_deswizzle_3d<u32, u16, false>>();
break;
case 8:
job = vk::get_compute_task<cs_deswizzle_3d<u64, u16, false>>();
break;
}
}
}
else
{
if (word_size == 4)
{
switch (block_size)
{
case 4:
job = vk::get_compute_task<cs_deswizzle_3d<u32, u32, true>>();
break;
case 8:
job = vk::get_compute_task<cs_deswizzle_3d<u64, u32, true>>();
break;
case 16:
job = vk::get_compute_task<cs_deswizzle_3d<u128, u32, true>>();
break;
}
}
else
{
switch (block_size)
{
case 4:
job = vk::get_compute_task<cs_deswizzle_3d<u32, u16, true>>();
break;
case 8:
job = vk::get_compute_task<cs_deswizzle_3d<u64, u16, true>>();
break;
}
}
}
verify(HERE), job;
for (auto &section : sections)
{
job->run(cmd, scratch_buf, dst_offset, scratch_buf, section.bufferOffset,
section.imageExtent.width, section.imageExtent.height, section.imageExtent.depth);
const u32 packed_size = section.imageExtent.width * section.imageExtent.height * section.imageExtent.depth * block_size;
section.bufferOffset = dst_offset;
dst_offset += packed_size;
}
verify(HERE), dst_offset <= scratch_buf->size();
}
void copy_mipmaped_image_using_buffer(VkCommandBuffer cmd, vk::image* dst_image,
const std::vector<rsx_subresource_layout>& subresource_layout, int format, bool is_swizzled, u16 mipmap_count,
VkImageAspectFlags flags, vk::data_heap &upload_heap, u32 heap_align)
@ -600,7 +684,7 @@ namespace vk
copy_info.imageSubresource.mipLevel = layout.level;
copy_info.bufferRowLength = block_in_pixel * row_pitch / block_size_in_bytes;
if (opt.require_swap || dst_image->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT)
if (opt.require_swap || opt.require_deswizzle || dst_image->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT)
{
if (!scratch_buf)
{
@ -623,7 +707,7 @@ namespace vk
}
}
if (opt.require_swap || dst_image->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT)
if (opt.require_swap || opt.require_deswizzle || dst_image->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT)
{
verify(HERE), scratch_buf;
vkCmdCopyBuffer(cmd, upload_heap.heap->value, scratch_buf->value, (u32)buffer_copies.size(), buffer_copies.data());
@ -632,8 +716,12 @@ namespace vk
VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT);
}
// Swap if requested
if (opt.require_swap)
// Swap and swizzle if requested
if (opt.require_deswizzle)
{
gpu_deswizzle_sections_impl(cmd, scratch_buf, scratch_offset, opt.element_size, opt.block_length, opt.require_swap, copy_regions);
}
else if (opt.require_swap)
{
if (opt.element_size == 4)
{
@ -658,9 +746,12 @@ namespace vk
vk::copy_buffer_to_image(cmd, scratch_buf, dst_image, *rIt);
}
}
else if (opt.require_swap)
else if (scratch_buf)
{
insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, scratch_offset, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
verify(HERE), opt.require_deswizzle || opt.require_swap;
const auto block_start = copy_regions.front().bufferOffset;
insert_buffer_memory_barrier(cmd, scratch_buf->value, block_start, scratch_offset, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
vkCmdCopyBufferToImage(cmd, scratch_buf->value, dst_image->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, (u32)copy_regions.size(), copy_regions.data());