mirror of
https://github.com/RPCS3/rpcs3.git
synced 2024-11-22 10:42:36 +01:00
vk: Enable gpu deswizzling
This commit is contained in:
parent
9cd3530c98
commit
1266b63135
@ -668,13 +668,19 @@ texture_memory_info upload_texture_subresource(gsl::span<gsl::byte> dst_buffer,
|
||||
{
|
||||
result.require_swap = true;
|
||||
result.element_size = word_size;
|
||||
result.block_length = words_per_block;
|
||||
|
||||
if (word_size == 2)
|
||||
{
|
||||
const bool skip_swizzle = ((word_size * words_per_block) & 3) == 0 && caps.supports_hw_deswizzle;
|
||||
if (is_swizzled && skip_swizzle) result.require_deswizzle = true;
|
||||
if (is_swizzled)
|
||||
{
|
||||
if (((word_size * words_per_block) & 3) == 0 && caps.supports_hw_deswizzle)
|
||||
{
|
||||
result.require_deswizzle = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (is_swizzled && !skip_swizzle)
|
||||
if (is_swizzled && !result.require_deswizzle)
|
||||
copy_unmodified_block_swizzled::copy_mipmap_level(as_span_workaround<u16>(dst_buffer), as_const_span<const u16>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block);
|
||||
else
|
||||
copy_unmodified_block::copy_mipmap_level(as_span_workaround<u16>(dst_buffer), as_const_span<const u16>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
|
||||
|
@ -104,6 +104,7 @@ struct rsx_subresource_layout
|
||||
// Result of preparing one texture subresource for upload: how the data is
// laid out in words, and which post-processing passes it still needs.
// Every member has a default so that a code path which only sets some of the
// fields never leaves the remaining flags indeterminate.
struct texture_memory_info
{
	// Size in bytes of one word; upload_texture_subresource stores word_size here.
	int element_size = 0;

	// Number of words per block; upload_texture_subresource stores words_per_block here.
	int block_length = 0;

	// True when the data still needs a byte swap after upload.
	bool require_swap = false;

	// True when the data was left swizzled and must be deswizzled by a later pass.
	bool require_deswizzle = false;
};
|
||||
|
@ -2,7 +2,7 @@
|
||||
#include "VKHelpers.h"
|
||||
#include "Utilities/StrUtil.h"
|
||||
|
||||
#define VK_MAX_COMPUTE_TASKS 1024 // Max number of jobs per frame
|
||||
#define VK_MAX_COMPUTE_TASKS 32768 // Max number of jobs per frame
|
||||
|
||||
namespace vk
|
||||
{
|
||||
@ -22,7 +22,9 @@ namespace vk
|
||||
bool initialized = false;
|
||||
bool unroll_loops = true;
|
||||
bool uniform_inputs = false;
|
||||
bool use_push_constants = false;
|
||||
u32 ssbo_count = 1;
|
||||
u32 push_constants_size = 0;
|
||||
u32 optimal_group_size = 1;
|
||||
u32 optimal_kernel_size = 1;
|
||||
|
||||
@ -77,6 +79,16 @@ namespace vk
|
||||
layout_info.setLayoutCount = 1;
|
||||
layout_info.pSetLayouts = &m_descriptor_layout;
|
||||
|
||||
VkPushConstantRange push_constants{};
|
||||
if (use_push_constants)
|
||||
{
|
||||
push_constants.size = push_constants_size;
|
||||
push_constants.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
|
||||
|
||||
layout_info.pushConstantRangeCount = 1;
|
||||
layout_info.pPushConstantRanges = &push_constants;
|
||||
}
|
||||
|
||||
CHECK_RESULT(vkCreatePipelineLayout(*get_current_renderer(), &layout_info, nullptr, &m_pipeline_layout));
|
||||
}
|
||||
|
||||
@ -258,7 +270,7 @@ namespace vk
|
||||
"\n"
|
||||
"void main()\n"
|
||||
"{\n"
|
||||
" uint index = %idx;\n"
|
||||
" uint index = gl_GlobalInvocationID.x;\n"
|
||||
" uint value;\n"
|
||||
" %vars"
|
||||
"\n";
|
||||
@ -550,19 +562,26 @@ namespace vk
|
||||
};
|
||||
|
||||
// Reverse morton-order block arrangement
|
||||
struct cs_deswizzle_base : compute_task
|
||||
{
|
||||
virtual void run(VkCommandBuffer cmd, const vk::buffer* dst, u32 out_offset, const vk::buffer* src, u32 in_offset, u32 width, u32 height, u32 depth) = 0;
|
||||
};
|
||||
|
||||
template <typename _BlockType, typename _BaseType, bool _SwapBytes>
|
||||
struct cs_deswizzle_3d : compute_task
|
||||
struct cs_deswizzle_3d : cs_deswizzle_base
|
||||
{
|
||||
union params_t
|
||||
{
|
||||
u32 data[4];
|
||||
u32 data[6];
|
||||
|
||||
struct
|
||||
{
|
||||
u32 width;
|
||||
u32 height;
|
||||
u32 depth;
|
||||
u32 logw;
|
||||
u32 logh;
|
||||
u32 logd;
|
||||
};
|
||||
}
|
||||
params;
|
||||
@ -578,25 +597,29 @@ namespace vk
|
||||
verify("Unsupported block type" HERE), (sizeof(_BlockType) & 3) == 0;
|
||||
|
||||
ssbo_count = 2;
|
||||
uniform_inputs = true;
|
||||
use_push_constants = true;
|
||||
push_constants_size = 24;
|
||||
|
||||
create();
|
||||
|
||||
m_src =
|
||||
"#version 450\n"
|
||||
"layout(local_size_x = 8, local_size_y = 8, local_size_z = 1)\n\n"
|
||||
"layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;\n\n"
|
||||
|
||||
"layout(set=0, binding=0, std430) buffer ssbo{ uint data_in[]; }\n"
|
||||
"layout(set=0, binding=1, std430) buffer ssbo{ uint data_out[]; }\n"
|
||||
"layout(set=0, binding=2, std140) uniform buffer parameters\n"
|
||||
"layout(set=0, binding=0, std430) buffer ssbo0{ uint data_in[]; };\n"
|
||||
"layout(set=0, binding=1, std430) buffer ssbo1{ uint data_out[]; };\n"
|
||||
"layout(push_constant) uniform parameters\n"
|
||||
"{\n"
|
||||
" uint image_width;\n"
|
||||
" uint image_height;\n"
|
||||
" uint image_depth;\n"
|
||||
" uint image_logw;\n"
|
||||
" uint image_logh;\n"
|
||||
" uint image_logd;\n"
|
||||
"};\n\n"
|
||||
|
||||
"#define bswap_u16(bits) (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8\n"
|
||||
"#define bswap_u32(bits) (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24\n"
|
||||
"#define bswap_u16(bits) (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8\n"
|
||||
"#define bswap_u32(bits) (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24\n"
|
||||
|
||||
"uint get_z_index(uint x, uint y, uint z, uint log2w, uint log2h, uint log2d)\n"
|
||||
"{\n"
|
||||
@ -629,26 +652,29 @@ namespace vk
|
||||
" log2d--;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" while(x > 0 || y > 0 || z > 0)\n"
|
||||
" while(x > 0 || y > 0 || z > 0);\n"
|
||||
"\n"
|
||||
" return offset;\n"
|
||||
"}\n\n"
|
||||
|
||||
"void main()\n"
|
||||
"{\n"
|
||||
" if (gl_GlobalInvocationID.x >= image_width || gl_GlobalInvocationID.y >= image_height)\n"
|
||||
" if (any(greaterThanEqual(gl_GlobalInvocationID, uvec3(image_width, image_height, image_depth))))\n"
|
||||
" return;\n\n"
|
||||
|
||||
" uint texel_id = (gl_GlobalInvocationID.y * image_width) + gl_GlobalInvocationID.x"
|
||||
" uint texel_id = (gl_GlobalInvocationID.z * image_width * image_height) + (gl_GlobalInvocationID.y * image_width) + gl_GlobalInvocationID.x;\n"
|
||||
" uint word_count = %_wordcount;\n"
|
||||
" uint dst_id = (index * word_count);\n\n"
|
||||
" uint dst_id = (texel_id * word_count);\n\n"
|
||||
|
||||
" uint src_id = get_z_index(gl_GlobalInvocationID.x, gl_GlobalInvocationID.y, gl_GlobalInvocationID.z, image_logw, image_logh, image_logd);\n"
|
||||
" src_id *= word_count;\n\n"
|
||||
|
||||
" uint src_id = get_z_index(gl_GlobalInvocationID.x, gl_GlobalInvocation.y, 0, image_logw, image_logh, 0);\n"
|
||||
" for (uint i = 0; i < word_count; ++i)\n"
|
||||
" {\n"
|
||||
" data_out[dst_id++] = %f(data_in[src_id++]);\n"
|
||||
" uint value = data_in[src_id++];\n"
|
||||
" data_out[dst_id++] = %f(value);\n"
|
||||
" }\n\n"
|
||||
|
||||
|
||||
"}\n";
|
||||
|
||||
std::string transform;
|
||||
@ -681,24 +707,14 @@ namespace vk
|
||||
{
|
||||
m_program->bind_buffer({ src_buffer->value, in_offset, block_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
|
||||
m_program->bind_buffer({ dst_buffer->value, out_offset, block_length }, 1, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
|
||||
m_program->bind_buffer({ m_param_buffer->value, 0, 16 }, 2, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, m_descriptor_set);
|
||||
}
|
||||
|
||||
// Uploads the kernel parameters held in `params` as push constants for the
// compute stage.
void set_parameters(VkCommandBuffer cmd)
{
	// The shader declares its parameter block as layout(push_constant), so the
	// parameters are pushed directly instead of being staged through a
	// separately allocated uniform buffer.
	// The byte count must stay in sync with push_constants_size (24) so the
	// range matches the one declared in the pipeline layout.
	vkCmdPushConstants(cmd, m_pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, 24, params.data);
}
|
||||
|
||||
void run(VkCommandBuffer cmd, const vk::buffer* dst, u32 out_offset, const vk::buffer* src, u32 in_offset, u32 width, u32 height, u32 depth)
|
||||
void run(VkCommandBuffer cmd, const vk::buffer* dst, u32 out_offset, const vk::buffer* src, u32 in_offset, u32 width, u32 height, u32 depth) override
|
||||
{
|
||||
dst_buffer = dst;
|
||||
src_buffer = src;
|
||||
@ -708,14 +724,16 @@ namespace vk
|
||||
this->block_length = sizeof(_BlockType) * width * height * depth;
|
||||
|
||||
params.width = width;
|
||||
params.height = height * depth;
|
||||
params.height = height;
|
||||
params.depth = depth;
|
||||
params.logw = rsx::ceil_log2(width);
|
||||
params.logh = rsx::ceil_log2(height);
|
||||
set_parameters();
|
||||
params.logd = rsx::ceil_log2(depth);
|
||||
set_parameters(cmd);
|
||||
|
||||
const u32 invocations_x = align(params.width, 8) / 8;
|
||||
const u32 invocations_y = align(params.height, 8) / 8;
|
||||
compute_task::run(cmd, invocations_x, invocations_y, 1);
|
||||
compute_task::run(cmd, invocations_x, invocations_y, depth);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -3299,6 +3299,7 @@ public:
|
||||
std::string shader_type = type == ::glsl::program_domain::glsl_vertex_program ? "vertex" :
|
||||
type == ::glsl::program_domain::glsl_fragment_program ? "fragment" : "compute";
|
||||
|
||||
LOG_NOTICE(RSX, "%s", m_source);
|
||||
fmt::throw_exception("Failed to compile %s shader" HERE, shader_type);
|
||||
}
|
||||
|
||||
|
@ -538,6 +538,90 @@ namespace vk
|
||||
change_image_layout(cmd, dst, preferred_dst_format, dstLayout, vk::get_image_subresource_range(0, 0, 1, 1, aspect));
|
||||
}
|
||||
|
||||
void gpu_deswizzle_sections_impl(VkCommandBuffer cmd, vk::buffer* scratch_buf, u32 dst_offset, int word_size, int word_count, bool swap_bytes, std::vector<VkBufferImageCopy>& sections)
|
||||
{
|
||||
// NOTE: This has to be done individually for every LOD
|
||||
vk::cs_deswizzle_base* job = nullptr;
|
||||
const auto block_size = (word_size * word_count);
|
||||
|
||||
verify(HERE), word_size == 4 || word_size == 2;
|
||||
|
||||
if (!swap_bytes)
|
||||
{
|
||||
if (word_size == 4)
|
||||
{
|
||||
switch (block_size)
|
||||
{
|
||||
case 4:
|
||||
job = vk::get_compute_task<cs_deswizzle_3d<u32, u32, false>>();
|
||||
break;
|
||||
case 8:
|
||||
job = vk::get_compute_task<cs_deswizzle_3d<u64, u32, false>>();
|
||||
break;
|
||||
case 16:
|
||||
job = vk::get_compute_task<cs_deswizzle_3d<u128, u32, false>>();
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
switch (block_size)
|
||||
{
|
||||
case 4:
|
||||
job = vk::get_compute_task<cs_deswizzle_3d<u32, u16, false>>();
|
||||
break;
|
||||
case 8:
|
||||
job = vk::get_compute_task<cs_deswizzle_3d<u64, u16, false>>();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (word_size == 4)
|
||||
{
|
||||
switch (block_size)
|
||||
{
|
||||
case 4:
|
||||
job = vk::get_compute_task<cs_deswizzle_3d<u32, u32, true>>();
|
||||
break;
|
||||
case 8:
|
||||
job = vk::get_compute_task<cs_deswizzle_3d<u64, u32, true>>();
|
||||
break;
|
||||
case 16:
|
||||
job = vk::get_compute_task<cs_deswizzle_3d<u128, u32, true>>();
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
switch (block_size)
|
||||
{
|
||||
case 4:
|
||||
job = vk::get_compute_task<cs_deswizzle_3d<u32, u16, true>>();
|
||||
break;
|
||||
case 8:
|
||||
job = vk::get_compute_task<cs_deswizzle_3d<u64, u16, true>>();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
verify(HERE), job;
|
||||
|
||||
for (auto §ion : sections)
|
||||
{
|
||||
job->run(cmd, scratch_buf, dst_offset, scratch_buf, section.bufferOffset,
|
||||
section.imageExtent.width, section.imageExtent.height, section.imageExtent.depth);
|
||||
|
||||
const u32 packed_size = section.imageExtent.width * section.imageExtent.height * section.imageExtent.depth * block_size;
|
||||
section.bufferOffset = dst_offset;
|
||||
dst_offset += packed_size;
|
||||
}
|
||||
|
||||
verify(HERE), dst_offset <= scratch_buf->size();
|
||||
}
|
||||
|
||||
void copy_mipmaped_image_using_buffer(VkCommandBuffer cmd, vk::image* dst_image,
|
||||
const std::vector<rsx_subresource_layout>& subresource_layout, int format, bool is_swizzled, u16 mipmap_count,
|
||||
VkImageAspectFlags flags, vk::data_heap &upload_heap, u32 heap_align)
|
||||
@ -600,7 +684,7 @@ namespace vk
|
||||
copy_info.imageSubresource.mipLevel = layout.level;
|
||||
copy_info.bufferRowLength = block_in_pixel * row_pitch / block_size_in_bytes;
|
||||
|
||||
if (opt.require_swap || dst_image->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT)
|
||||
if (opt.require_swap || opt.require_deswizzle || dst_image->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT)
|
||||
{
|
||||
if (!scratch_buf)
|
||||
{
|
||||
@ -623,7 +707,7 @@ namespace vk
|
||||
}
|
||||
}
|
||||
|
||||
if (opt.require_swap || dst_image->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT)
|
||||
if (opt.require_swap || opt.require_deswizzle || dst_image->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT)
|
||||
{
|
||||
verify(HERE), scratch_buf;
|
||||
vkCmdCopyBuffer(cmd, upload_heap.heap->value, scratch_buf->value, (u32)buffer_copies.size(), buffer_copies.data());
|
||||
@ -632,8 +716,12 @@ namespace vk
|
||||
VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT);
|
||||
}
|
||||
|
||||
// Swap if requested
|
||||
if (opt.require_swap)
|
||||
// Swap and swizzle if requested
|
||||
if (opt.require_deswizzle)
|
||||
{
|
||||
gpu_deswizzle_sections_impl(cmd, scratch_buf, scratch_offset, opt.element_size, opt.block_length, opt.require_swap, copy_regions);
|
||||
}
|
||||
else if (opt.require_swap)
|
||||
{
|
||||
if (opt.element_size == 4)
|
||||
{
|
||||
@ -658,9 +746,12 @@ namespace vk
|
||||
vk::copy_buffer_to_image(cmd, scratch_buf, dst_image, *rIt);
|
||||
}
|
||||
}
|
||||
else if (opt.require_swap)
|
||||
else if (scratch_buf)
|
||||
{
|
||||
insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, scratch_offset, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
|
||||
verify(HERE), opt.require_deswizzle || opt.require_swap;
|
||||
|
||||
const auto block_start = copy_regions.front().bufferOffset;
|
||||
insert_buffer_memory_barrier(cmd, scratch_buf->value, block_start, scratch_offset, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
|
||||
VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
|
||||
|
||||
vkCmdCopyBufferToImage(cmd, scratch_buf->value, dst_image->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, (u32)copy_regions.size(), copy_regions.data());
|
||||
|
Loading…
Reference in New Issue
Block a user