
vk: Reimplement DMA synchronization

Authored by kd-11 on 2019-08-17 14:06:38 +03:00; committed by kd-11
parent f06559412e
commit 212ac19c11
9 changed files with 539 additions and 52 deletions


@@ -358,6 +358,7 @@ target_sources(rpcs3_emu PRIVATE
if(TARGET 3rdparty_vulkan)
target_sources(rpcs3_emu PRIVATE
RSX/VK/VKCommonDecompiler.cpp
RSX/VK/VKDMA.cpp
RSX/VK/VKFormats.cpp
RSX/VK/VKFragmentProgram.cpp
RSX/VK/VKFramebuffer.cpp


@@ -1401,7 +1401,7 @@ namespace rsx
}
}
void imp_flush()
virtual void imp_flush()
{
AUDIT(synchronized);

rpcs3/Emu/RSX/VK/VKDMA.cpp (new file, 372 lines)

@@ -0,0 +1,372 @@
#include "stdafx.h"
#include "VKHelpers.h"
#include "VKResourceManager.h"
#include "VKDMA.h"
namespace vk
{
static constexpr size_t s_dma_block_length = 0x01000000;
static constexpr u32 s_dma_block_mask = 0xFF000000;
static constexpr u32 s_dma_offset_mask = 0x00FFFFFF;
static constexpr u32 s_page_size = 16384;
static constexpr u32 s_page_align = s_page_size - 1;
static constexpr u32 s_pages_per_entry = 32;
static constexpr u32 s_bits_per_page = 2;
static constexpr u32 s_bytes_per_entry = (s_page_size * s_pages_per_entry);
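// Layout notes (illustrative): addresses are carved into 16 MiB blocks, so
// block_base = addr & s_dma_block_mask and offset = addr & s_dma_offset_mask.
// For example, address 0x0C123456 falls in block 0x0C000000 at offset 0x00123456.
// Page state is tracked at 2 bits per 16 KiB page, 32 pages per u64 entry.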
std::unordered_map<u32, dma_block> g_dma_pool;
void* dma_block::map_range(const utils::address_range& range)
{
if (inheritance_info.parent)
{
return inheritance_info.parent->map_range(range);
}
verify(HERE), range.start >= base_address;
u32 start = range.start;
start -= base_address;
return allocated_memory->map(start, range.length());
}
void dma_block::unmap()
{
if (inheritance_info.parent)
{
inheritance_info.parent->unmap();
}
else
{
allocated_memory->unmap();
}
}
void dma_block::init(const render_device& dev, u32 addr, size_t size)
{
verify(HERE), size, !(size % s_dma_block_length);
base_address = addr;
allocated_memory = std::make_unique<vk::buffer>(dev, size,
dev.get_memory_mapping().host_visible_coherent, VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT, 0); // SRC usage is needed: this buffer is the copy source in extend()/set_parent()
page_info.resize(size / s_bytes_per_entry, ~0ull);
}
void dma_block::init(dma_block* parent, u32 addr, size_t size)
{
base_address = addr;
inheritance_info.parent = parent;
inheritance_info.block_offset = (addr - parent->base_address);
}
void dma_block::set_page_bit(u32 offset, u64 bits)
{
const auto entry = (offset / s_bytes_per_entry);
const auto word = entry / s_pages_per_entry;
const auto shift = (entry % s_pages_per_entry) * s_bits_per_page;
page_info[word] &= ~(3ull << shift); // 64-bit mask; shift can reach 62
page_info[word] |= (bits << shift);
}
bool dma_block::test_page_bit(u32 offset, u64 bits)
{
const auto entry = (offset / s_bytes_per_entry);
const auto word = entry / s_pages_per_entry;
const auto shift = (entry % s_pages_per_entry) * s_bits_per_page;
return !!(page_info[word] & (bits << shift));
}
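// Worked example of the indexing above (illustrative): for offset 0x00500000,
// entry = 0x00500000 / 0x80000 = 10, word = 10 / 32 = 0 and shift = (10 % 32) * 2 = 20,
// so the state bits for that offset live at bits [21:20] of page_info[0].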
void dma_block::mark_dirty(const utils::address_range& range)
{
if (!inheritance_info.parent)
{
const u32 start = align(range.start, s_page_size);
const u32 end = ((range.end + 1) & ~s_page_align); // round down to a page boundary
for (u32 page = start; page < end; page += s_page_size)
{
set_page_bit(page - base_address, page_bits::dirty);
}
if (UNLIKELY(start > range.start))
{
// Head page is only partially covered; offsets passed down are relative to base_address
set_page_bit(start - s_page_size - base_address, page_bits::nocache);
}
if (UNLIKELY(end < range.end))
{
// Tail page is only partially covered
set_page_bit(end - base_address, page_bits::nocache);
}
}
else
{
inheritance_info.parent->mark_dirty(range);
}
}
void dma_block::set_page_info(u32 page_offset, const std::vector<u64>& bits)
{
if (!inheritance_info.parent)
{
auto bit_offset = page_offset / s_bytes_per_entry;
verify(HERE), (bit_offset + bits.size()) <= page_info.size();
std::memcpy(page_info.data() + bit_offset, bits.data(), bits.size() * sizeof(u64)); // memcpy size is in bytes, not elements
}
else
{
inheritance_info.parent->set_page_info(page_offset + inheritance_info.block_offset, bits);
}
}
void dma_block::flush(const utils::address_range& range)
{
auto src = map_range(range);
auto dst = vm::get_super_ptr(range.start);
std::memcpy(dst, src, range.length());
// TODO: Clear page bits
unmap();
}
void dma_block::load(const utils::address_range& range)
{
auto src = vm::get_super_ptr(range.start);
auto dst = map_range(range);
std::memcpy(dst, src, range.length());
// TODO: Clear page bits to synchronized
unmap();
}
std::pair<u32, buffer*> dma_block::get(const utils::address_range& range)
{
if (inheritance_info.parent)
{
return inheritance_info.parent->get(range);
}
verify(HERE), range.start >= base_address, range.end <= end();
// mark_dirty(range);
return { (range.start - base_address), allocated_memory.get() };
}
dma_block* dma_block::head()
{
if (!inheritance_info.parent)
return this;
return inheritance_info.parent->head();
}
const dma_block* dma_block::head() const
{
if (!inheritance_info.parent)
return this;
return inheritance_info.parent->head();
}
void dma_block::set_parent(command_buffer& cmd, dma_block* parent)
{
verify(HERE), parent;
if (inheritance_info.parent == parent)
{
// Nothing to do
return;
}
inheritance_info.parent = parent;
inheritance_info.block_offset = (base_address - parent->base_address);
if (allocated_memory)
{
VkBufferCopy copy{};
copy.srcOffset = 0;
copy.dstOffset = inheritance_info.block_offset;
copy.size = allocated_memory->size();
vkCmdCopyBuffer(cmd, allocated_memory->value, parent->allocated_memory->value, 1, &copy);
auto gc = vk::get_resource_manager();
gc->dispose(allocated_memory);
parent->set_page_info(inheritance_info.block_offset, page_info);
page_info.clear();
}
}
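// Chaining sketch (illustrative): if separate blocks exist at 0x0C000000 and
// 0x0D000000 and a request spans both, the second is re-parented onto the first.
// Its backing buffer is copied into the parent at block_offset 0x01000000 and
// disposed; afterwards the child only forwards map/get/flush calls to the parent.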
void dma_block::extend(command_buffer& cmd, const render_device &dev, size_t new_size)
{
verify(HERE), allocated_memory;
if (new_size <= allocated_memory->size())
return;
const auto required_entries = new_size / s_bytes_per_entry;
page_info.resize(required_entries, ~0ull);
auto new_allocation = std::make_unique<vk::buffer>(dev, new_size,
dev.get_memory_mapping().host_visible_coherent, VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT, 0); // SRC usage is needed: the old buffer is the copy source when extending again
VkBufferCopy copy{};
copy.size = allocated_memory->size();
vkCmdCopyBuffer(cmd, allocated_memory->value, new_allocation->value, 1, &copy);
auto gc = vk::get_resource_manager();
gc->dispose(allocated_memory);
allocated_memory = std::move(new_allocation);
}
u32 dma_block::start() const
{
return base_address;
}
u32 dma_block::end() const
{
auto source = head();
return (source->base_address + source->allocated_memory->size() - 1);
}
u32 dma_block::size() const
{
return (allocated_memory) ? allocated_memory->size() : 0;
}
std::pair<u32, vk::buffer*> map_dma(command_buffer& cmd, u32 local_address, u32 length)
{
const auto map_range = utils::address_range::start_length(local_address, length);
const auto first_block = (local_address & s_dma_block_mask);
const auto limit = local_address + length - 1;
auto last_block = (limit & s_dma_block_mask);
if (LIKELY(first_block == last_block))
{
if (auto found = g_dma_pool.find(first_block); found != g_dma_pool.end())
{
return found->second.get(map_range);
}
auto &block_info = g_dma_pool[first_block];
block_info.init(*vk::get_current_renderer(), first_block, s_dma_block_length);
return block_info.get(map_range);
}
dma_block* block_head = nullptr;
auto block_end = align(limit, s_dma_block_length);
// Reverse scan to find the minimum required length in case blocks in this range are already chained
for (auto block = last_block; block != first_block; block -= s_dma_block_length)
{
if (auto found = g_dma_pool.find(block); found != g_dma_pool.end())
{
const auto end = found->second.end();
last_block = std::max(last_block, end & s_dma_block_mask);
block_end = std::max(block_end, end + 1);
break;
}
}
for (auto block = first_block; block <= last_block; block += s_dma_block_length)
{
auto found = g_dma_pool.find(block);
const bool exists = (found != g_dma_pool.end());
auto entry = exists ? &found->second : &g_dma_pool[block];
if (block == first_block)
{
block_head = entry->head();
if (exists)
{
if (entry->end() < limit)
{
auto new_length = block_end - block_head->start();
block_head->extend(cmd, *vk::get_current_renderer(), new_length);
}
}
else
{
auto required_size = (block_end - block);
block_head->init(*vk::get_current_renderer(), block, required_size);
}
}
else
{
if (exists)
{
entry->set_parent(cmd, block_head);
}
else
{
entry->init(block_head, block, s_dma_block_length);
}
}
}
verify(HERE), block_head;
return block_head->get(map_range);
}
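// Example walk-through (illustrative): a request at 0x00FFF000 with length
// 0x2000 has first_block 0x00000000 and last_block 0x01000000, so the slow
// path runs. The reverse scan extends last_block/block_end over any existing
// chain, the head block is created or extended to cover the whole span, and
// the caller receives an offset into the head's single VkBuffer.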
template<bool load>
void sync_dma_impl(u32 local_address, u32 length)
{
const auto limit = local_address + length - 1;
while (length)
{
u32 block = (local_address & s_dma_block_mask);
if (auto found = g_dma_pool.find(block); found != g_dma_pool.end())
{
const auto sync_end = std::min(limit, found->second.end());
const auto range = utils::address_range::start_end(local_address, sync_end);
if constexpr (load)
{
found->second.load(range);
}
else
{
found->second.flush(range);
}
if (UNLIKELY(sync_end < limit))
{
// Technically legal, but assuming a map->flush usage pattern this should not happen
// Optimizations could in theory batch together multiple transfers though
LOG_ERROR(RSX, "Sink request spans multiple allocated blocks!");
const auto write_end = (sync_end + 1u);
const auto written = (write_end - local_address);
length -= written;
local_address = write_end;
continue;
}
break;
}
else
{
LOG_ERROR(RSX, "Sync command on range not mapped!");
return;
}
}
}
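// The wrappers below are thin directional helpers over sync_dma_impl:
// 'load' pulls guest memory into the mapped block, 'flush' writes it back out.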
void load_dma(u32 local_address, u32 length)
{
sync_dma_impl<true>(local_address, length);
}
void flush_dma(u32 local_address, u32 length)
{
sync_dma_impl<false>(local_address, length);
}
void clear_dma_resources()
{
g_dma_pool.clear();
}
}

rpcs3/Emu/RSX/VK/VKDMA.h (new file, 57 lines)

@@ -0,0 +1,57 @@
#pragma once
#include "VKHelpers.h"
namespace vk
{
std::pair<u32, vk::buffer*> map_dma(command_buffer& cmd, u32 local_address, u32 length);
void load_dma(u32 local_address, u32 length);
void flush_dma(u32 local_address, u32 length);
void clear_dma_resources();
class dma_block
{
enum page_bits
{
synchronized = 0,
dirty = 1,
nocache = 3
};
struct
{
dma_block* parent = nullptr;
u32 block_offset = 0;
} inheritance_info;
u32 base_address = 0;
std::unique_ptr<buffer> allocated_memory;
std::vector<u64> page_info;
void* map_range(const utils::address_range& range);
void unmap();
void set_page_bit(u32 page, u64 bits);
bool test_page_bit(u32 page, u64 bits);
void mark_dirty(const utils::address_range& range);
void set_page_info(u32 page_offset, const std::vector<u64>& bits);
public:
void init(const render_device& dev, u32 addr, size_t size);
void init(dma_block* parent, u32 addr, size_t size);
void flush(const utils::address_range& range);
void load(const utils::address_range& range);
std::pair<u32, buffer*> get(const utils::address_range& range);
u32 start() const;
u32 end() const;
u32 size() const;
dma_block* head();
const dma_block* head() const;
void set_parent(command_buffer& cmd, dma_block* parent);
void extend(command_buffer& cmd, const render_device& dev, size_t new_size);
};
}
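As a rough usage sketch of this API (hypothetical caller; everything except the vk:: functions is illustrative and not part of this commit), the intended flow mirrors what the texture cache change below does:

void writeback_example(vk::command_buffer& cmd, u32 rsx_address, u32 length)
{
// Resolve (or lazily create and chain) the persistent DMA block for the range
const auto [offset, buffer] = vk::map_dma(cmd, rsx_address, length);

// ... record GPU commands that write into 'buffer' at 'offset' ...

// Later, once the recorded work is known to have completed on the GPU,
// copy the block contents back into guest memory
vk::flush_dma(rsx_address, length);
}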


@@ -6,6 +6,7 @@
#include "VKFramebuffer.h"
#include "VKResolveHelper.h"
#include "VKResourceManager.h"
#include "VKDMA.h"
#include "Utilities/mutex.h"
namespace vk
@@ -265,6 +266,7 @@ namespace vk
vk::clear_renderpass_cache(dev);
vk::clear_framebuffer_cache();
vk::clear_resolve_helpers();
vk::clear_dma_resources();
vk::get_resource_manager()->destroy();
g_null_texture.reset();


@@ -4,6 +4,7 @@
#include "VKGSRender.h"
#include "VKCompute.h"
#include "VKResourceManager.h"
#include "VKDMA.h"
#include "Emu/System.h"
#include "../Common/TextureUtils.h"
#include "Utilities/mutex.h"
@@ -39,7 +40,6 @@ namespace vk
VkEvent dma_fence = VK_NULL_HANDLE;
vk::render_device* m_device = nullptr;
vk::viewable_image *vram_texture = nullptr;
std::unique_ptr<vk::buffer> dma_buffer;
public:
using baseclass::cached_texture_section;
@@ -73,7 +73,7 @@ namespace vk
if (!flushed)
{
// Reset fence
verify(HERE), m_device, dma_buffer, dma_fence;
verify(HERE), m_device, dma_fence;
vk::get_resource_manager()->dispose(dma_fence);
}
@@ -88,10 +88,9 @@ namespace vk
void release_dma_resources()
{
if (dma_buffer)
if (dma_fence)
{
auto gc = vk::get_resource_manager();
gc->dispose(dma_buffer);
gc->dispose(dma_fence);
}
}
@@ -187,12 +186,6 @@ namespace vk
vkCreateEvent(*m_device, &createInfo, nullptr, &dma_fence);
}
if (!dma_buffer)
{
auto memory_type = m_device->get_memory_mapping().host_visible_coherent;
dma_buffer = std::make_unique<vk::buffer>(*m_device, align(get_section_size(), 256), memory_type, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0);
}
vk::image *locked_resource = vram_texture;
u32 transfer_width = width;
u32 transfer_height = height;
@@ -230,21 +223,52 @@ namespace vk
verify(HERE), target->current_layout == VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
// Handle any format conversions using compute tasks
vk::cs_shuffle_base *shuffle_kernel = nullptr;
// TODO: Read back stencil values (is this really necessary?)
const auto internal_bpp = vk::get_format_texel_width(vram_texture->format());
const auto valid_range = get_confirmed_range();
real_pitch = internal_bpp * transfer_width;
if (vram_texture->format() == VK_FORMAT_D24_UNORM_S8_UINT)
u32 transfer_x = 0, transfer_y = 0;
if (const auto section_range = get_section_range(); section_range != valid_range)
{
shuffle_kernel = vk::get_compute_task<vk::cs_shuffle_se_d24x8>();
if (const auto offset = (valid_range.start - get_section_base()))
{
transfer_y = offset / rsx_pitch;
transfer_x = (offset % rsx_pitch) / internal_bpp;
verify(HERE), transfer_width >= transfer_x, transfer_height >= transfer_y;
transfer_width -= transfer_x;
transfer_height -= transfer_y;
}
if (const auto tail = (section_range.end - valid_range.end))
{
const auto row_count = tail / rsx_pitch;
verify(HERE), transfer_height >= row_count;
transfer_height -= row_count;
}
}
else if (vram_texture->format() == VK_FORMAT_D32_SFLOAT_S8_UINT)
{
shuffle_kernel = vk::get_compute_task<vk::cs_shuffle_se_f32_d24x8>();
}
else if (pack_unpack_swap_bytes)
if ((vram_texture->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT) ||
pack_unpack_swap_bytes)
{
const auto section_length = valid_range.length();
const auto transfer_pitch = transfer_width * internal_bpp;
const auto task_length = transfer_pitch * transfer_height;
auto working_buffer = vk::get_scratch_buffer();
auto final_mapping = vk::map_dma(cmd, valid_range.start, section_length);
VkBufferImageCopy region = {};
region.imageSubresource = { vram_texture->aspect(), 0, 0, 1 };
region.imageOffset = { (s32)transfer_x, (s32)transfer_y, 0 };
region.imageExtent = { transfer_width, transfer_height, 1 };
vk::copy_image_to_buffer(cmd, target, working_buffer, region);
const auto texel_layout = vk::get_format_element_size(vram_texture->format());
const auto elem_size = texel_layout.first;
vk::cs_shuffle_base *shuffle_kernel;
if (elem_size == 2)
{
@@ -254,38 +278,60 @@ namespace vk
{
shuffle_kernel = vk::get_compute_task<vk::cs_shuffle_32>();
}
}
else
{
fmt::throw_exception("Unreachable" HERE);
}
// Do not run the compute task on host visible memory
vk::buffer* mem_target = shuffle_kernel ? vk::get_scratch_buffer() : dma_buffer.get();
// TODO: Read back stencil values (is this really necessary?)
VkBufferImageCopy region = {};
region.imageSubresource = {vram_texture->aspect() & ~(VK_IMAGE_ASPECT_STENCIL_BIT), 0, 0, 1};
region.imageExtent = {transfer_width, transfer_height, 1};
vkCmdCopyImageToBuffer(cmd, target->value, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, mem_target->value, 1, &region);
locked_resource->pop_layout(cmd);
real_pitch = vk::get_format_texel_width(vram_texture->format()) * transfer_width;
if (shuffle_kernel)
{
verify (HERE), mem_target->value != dma_buffer->value;
vk::insert_buffer_memory_barrier(cmd, mem_target->value, 0, get_section_size(),
vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length,
VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
shuffle_kernel->run(cmd, mem_target, get_section_size());
shuffle_kernel->run(cmd, working_buffer, task_length);
vk::insert_buffer_memory_barrier(cmd, mem_target->value, 0, get_section_size(),
vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
VkBufferCopy copy = {};
copy.size = get_section_size();
vkCmdCopyBuffer(cmd, mem_target->value, dma_buffer->value, 1, &copy);
if (LIKELY(rsx_pitch == real_pitch))
{
VkBufferCopy copy = {};
copy.dstOffset = final_mapping.first;
copy.size = section_length;
vkCmdCopyBuffer(cmd, working_buffer->value, final_mapping.second->value, 1, &copy);
}
else
{
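// Pitch mismatch: destination rows are strided by rsx_pitch in the DMA block,
// so emit one VkBufferCopy per row instead of a single linear copy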
std::vector<VkBufferCopy> copy;
copy.reserve(transfer_height);
u32 dst_offset = final_mapping.first;
u32 src_offset = 0;
for (unsigned row = 0; row < transfer_height; ++row)
{
copy.push_back({src_offset, dst_offset, transfer_pitch});
src_offset += real_pitch;
dst_offset += rsx_pitch;
}
vkCmdCopyBuffer(cmd, working_buffer->value, final_mapping.second->value, transfer_height, copy.data());
}
}
else
{
VkBufferImageCopy region = {};
region.bufferRowLength = (rsx_pitch / internal_bpp);
region.imageSubresource = { vram_texture->aspect(), 0, 0, 1 };
region.imageOffset = { (s32)transfer_x, (s32)transfer_y, 0 };
region.imageExtent = { transfer_width, transfer_height, 1 };
auto mapping = vk::map_dma(cmd, valid_range.start, valid_range.length());
region.bufferOffset = mapping.first;
vkCmdCopyImageToBuffer(cmd, target->value, target->current_layout, mapping.second->value, 1, &region);
}
locked_resource->pop_layout(cmd);
if (UNLIKELY(synchronized))
{
@@ -314,7 +360,7 @@ namespace vk
/**
* Flush
*/
void* map_synchronized(u32 offset, u32 size)
void imp_flush() override
{
AUDIT(synchronized);
@@ -322,12 +368,8 @@ namespace vk
vk::wait_for_event(dma_fence, GENERAL_WAIT_TIMEOUT);
vkResetEvent(*m_device, dma_fence);
return dma_buffer->map(offset, size);
}
void finish_flush()
{
dma_buffer->unmap();
const auto range = get_confirmed_range();
vk::flush_dma(range.start, range.length());
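// The block-level flush replaces the old per-section dma_buffer map/memcpy path removed above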
if (context == rsx::texture_upload_context::framebuffer_storage)
{
@@ -336,6 +378,11 @@ namespace vk
}
}
void *map_synchronized(u32, u32)
{ return nullptr; }
void finish_flush()
{}
/**
* Misc


@@ -46,7 +46,7 @@ namespace rsx
u8* buf = buffer;
// Read the whole buffer from source
for (u32 y = 0; y < clip_h; ++y)
for (int y = 0; y < clip_h; ++y)
{
std::memcpy(buf, src, buffer_pitch);
src += src_pitch;
@@ -56,7 +56,7 @@ namespace rsx
buf = buffer;
// Write to destination
for (u32 y = 0; y < clip_h; ++y)
for (int y = 0; y < clip_h; ++y)
{
std::memcpy(dst, buf, buffer_pitch);
dst += dst_pitch;


@@ -25,6 +25,7 @@
<ItemGroup>
<ClInclude Include="Emu\RSX\VK\VKCommonDecompiler.h" />
<ClInclude Include="Emu\RSX\VK\VKCompute.h" />
<ClInclude Include="Emu\RSX\VK\VKDMA.h" />
<ClInclude Include="Emu\RSX\VK\VKFormats.h" />
<ClInclude Include="Emu\RSX\VK\VKFragmentProgram.h" />
<ClInclude Include="Emu\RSX\VK\VKFramebuffer.h" />
@@ -43,6 +44,7 @@
</ItemGroup>
<ItemGroup>
<ClCompile Include="Emu\RSX\VK\VKCommonDecompiler.cpp" />
<ClCompile Include="Emu\RSX\VK\VKDMA.cpp" />
<ClCompile Include="Emu\RSX\VK\VKFormats.cpp" />
<ClCompile Include="Emu\RSX\VK\VKFragmentProgram.cpp" />
<ClCompile Include="Emu\RSX\VK\VKFramebuffer.cpp" />


@@ -58,6 +58,9 @@
<ClInclude Include="Emu\RSX\VK\VKFramebuffer.h">
<Filter>Source Files</Filter>
</ClInclude>
<ClInclude Include="Emu\RSX\VK\VKDMA.h">
<Filter>Source Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile Include="Emu\RSX\VK\VKGSRender.cpp">
@@ -105,5 +108,8 @@
<ClCompile Include="Emu\RSX\VK\VKFramebuffer.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="Emu\RSX\VK\VKDMA.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
</Project>