From 58d367d7049ab0a64b9600cc3f962fc667103831 Mon Sep 17 00:00:00 2001
From: kd-11
Date: Sun, 10 Jan 2021 20:25:02 +0300
Subject: [PATCH] vk: Add host-imported DMA buffers

---
 rpcs3/Emu/RSX/VK/VKDMA.cpp                 | 120 +++++++++++++++------
 rpcs3/Emu/RSX/VK/VKDMA.h                   |  30 ++++--
 rpcs3/Emu/RSX/VK/vkutils/buffer_object.cpp |  47 ++++++++
 rpcs3/Emu/RSX/VK/vkutils/buffer_object.h   |   1 +
 rpcs3/Emu/RSX/VK/vkutils/device.cpp        |  11 ++
 rpcs3/Emu/RSX/VK/vkutils/device.h          |   3 +
 rpcs3/Emu/RSX/VK/vkutils/memory.cpp        |  48 ++++++++-
 rpcs3/Emu/RSX/VK/vkutils/memory.h          |  35 ++++--
 8 files changed, 245 insertions(+), 50 deletions(-)

diff --git a/rpcs3/Emu/RSX/VK/VKDMA.cpp b/rpcs3/Emu/RSX/VK/VKDMA.cpp
index 8e476bd647..166bf52879 100644
--- a/rpcs3/Emu/RSX/VK/VKDMA.cpp
+++ b/rpcs3/Emu/RSX/VK/VKDMA.cpp
@@ -10,17 +10,17 @@ namespace vk
 {
-	static constexpr usz s_dma_block_length = 0x01000000;
-	static constexpr u32 s_dma_block_mask = 0xFF000000;
-	//static constexpr u32 s_dma_offset_mask = 0x00FFFFFF;
+	static constexpr usz s_dma_block_length = 0x00001000;//0x01000000;
+	static constexpr u32 s_dma_block_mask = 0xFFFFF000;//0xFF000000;
+	//static constexpr u32 s_dma_offset_mask = 0x00000FFF;//0x00FFFFFF;
 
-	static constexpr u32 s_page_size = 16384;
-	static constexpr u32 s_page_align = s_page_size - 1;
-	static constexpr u32 s_pages_per_entry = 32;
-	static constexpr u32 s_bits_per_page = 2;
-	static constexpr u32 s_bytes_per_entry = (s_page_size * s_pages_per_entry);
+	static constexpr u32 s_page_size        = 16384;
+	static constexpr u32 s_page_align       = s_page_size - 1;
+	static constexpr u32 s_pages_per_entry  = 32;
+	static constexpr u32 s_bits_per_page    = 2;
+	static constexpr u32 s_bytes_per_entry  = (s_page_size * s_pages_per_entry);
 
-	std::unordered_map<u32, dma_block> g_dma_pool;
+	std::unordered_map<u32, std::unique_ptr<dma_block>> g_dma_pool;
 
 	void* dma_block::map_range(const utils::address_range& range)
 	{
@@ -47,16 +47,28 @@ namespace vk
 		}
 	}
 
+	void dma_block::allocate(const render_device& dev, usz size)
+	{
+		if (allocated_memory)
+		{
+			// Acquired blocks are always to be assumed dirty. It is not possible to synchronize host access and inline
+			// buffer copies without causing weird issues. Overlapped incomplete data ends up overwriting host-uploaded data.
+			auto gc = vk::get_resource_manager();
+			gc->dispose(allocated_memory);
+		}
+
+		allocated_memory = std::make_unique<buffer>(dev, size,
+			dev.get_memory_mapping().host_visible_coherent, VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+			VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0);
+	}
+
 	void dma_block::init(const render_device& dev, u32 addr, usz size)
 	{
 		ensure(size);
 		ensure(!(size % s_dma_block_length));
 		base_address = addr;
-		allocated_memory = std::make_unique<buffer>(dev, size,
-			dev.get_memory_mapping().host_visible_coherent, VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
-			VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0);
-
+		allocate(dev, size);
 		page_info.resize(size / s_bytes_per_entry, ~0ull);
 	}
 
@@ -70,7 +82,7 @@ namespace vk
 	void dma_block::set_page_bit(u32 offset, u64 bits)
 	{
 		const auto entry = (offset / s_bytes_per_entry);
-		const auto word = entry / s_pages_per_entry;
+		const auto word  = entry / s_pages_per_entry;
 		const auto shift = (entry % s_pages_per_entry) * s_bits_per_page;
 
 		page_info[word] &= ~(3 << shift);
@@ -202,24 +214,16 @@ namespace vk
 		}
 	}
 
-	void dma_block::extend(const command_buffer& cmd, const render_device &dev, usz new_size)
+	void dma_block::extend(const command_buffer& cmd, const render_device& dev, usz new_size)
 	{
 		ensure(allocated_memory);
 		if (new_size <= allocated_memory->size())
 			return;
 
+		allocate(dev, new_size);
+
 		const auto required_entries = new_size / s_bytes_per_entry;
 		page_info.resize(required_entries, ~0ull);
-
-		auto new_allocation = std::make_unique<buffer>(dev, new_size,
-			dev.get_memory_mapping().host_visible_coherent, VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
-			VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0);
-
-		// Acquired blocks are always to be assumed dirty. It is not possible to synchronize host access and inline
-		// buffer copies without causing weird issues. Overlapped incomplete data ends up overwriting host-uploaded data.
-		auto gc = vk::get_resource_manager();
-		gc->dispose(allocated_memory);
-		allocated_memory = std::move(new_allocation);
 	}
 
 	u32 dma_block::start() const
@@ -238,6 +242,48 @@ namespace vk
 		return (allocated_memory) ? allocated_memory->size() : 0;
 	}
 
+	void dma_block_EXT::allocate(const render_device& dev, usz size)
+	{
+		if (allocated_memory)
+		{
+			// Acquired blocks are always to be assumed dirty. It is not possible to synchronize host access and inline
+			// buffer copies without causing weird issues. Overlapped incomplete data ends up overwriting host-uploaded data.
+			auto gc = vk::get_resource_manager();
+			gc->dispose(allocated_memory);
+		}
+
+		allocated_memory = std::make_unique<buffer>(dev,
+			VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
+			vm::get_super_ptr(base_address),
+			size);
+	}
+
+	void* dma_block_EXT::map_range(const utils::address_range& range)
+	{
+		return vm::get_super_ptr(range.start);
+	}
+
+	void dma_block_EXT::unmap()
+	{
+		// NOP
+	}
+
+	void dma_block_EXT::flush(const utils::address_range& range)
+	{
+		// NOP
+	}
+
+	void dma_block_EXT::load(const utils::address_range& range)
+	{
+		// NOP
+	}
+
+	void create_dma_block(std::unique_ptr<dma_block>& block)
+	{
+		// TODO
+		block.reset(new dma_block_EXT());
+	}
+
 	std::pair map_dma(const command_buffer& cmd, u32 local_address, u32 length)
 	{
 		const auto map_range = utils::address_range::start_length(local_address, length);
@@ -247,17 +293,19 @@
 
 		if (auto found = g_dma_pool.find(first_block); found != g_dma_pool.end())
 		{
-			if (found->second.end() >= limit)
+			if (found->second->end() >= limit)
 			{
-				return found->second.get(map_range);
+				return found->second->get(map_range);
 			}
 		}
 
 		if (first_block == last_block) [[likely]]
 		{
 			auto &block_info = g_dma_pool[first_block];
-			block_info.init(*g_render_device, first_block, s_dma_block_length);
-			return block_info.get(map_range);
+			if (!block_info) create_dma_block(block_info);
+
+			block_info->init(*g_render_device, first_block, s_dma_block_length);
+			return block_info->get(map_range);
 		}
 
 		dma_block* block_head = nullptr;
@@ -268,7 +316,7 @@
 		{
 			if (auto found = g_dma_pool.find(block); found != g_dma_pool.end())
 			{
-				const auto end = found->second.end();
+				const auto end = found->second->end();
 				last_block = std::max(last_block, end & s_dma_block_mask);
 				block_end = std::max(block_end, end + 1);
 
@@ -279,8 +327,10 @@
 		for (auto block = first_block; block <= last_block; block += s_dma_block_length)
 		{
 			auto found = g_dma_pool.find(block);
-			const bool exists = (found != g_dma_pool.end());
-			auto entry = exists ? &found->second : &g_dma_pool[block];
+			auto &entry = g_dma_pool[block];
+
+			const bool exists = !!entry;
+			if (!exists) create_dma_block(entry);
 
 			if (block == first_block)
 			{
@@ -326,16 +376,16 @@
 		u32 block = (local_address & s_dma_block_mask);
 		if (auto found = g_dma_pool.find(block); found != g_dma_pool.end())
 		{
-			const auto sync_end = std::min(limit, found->second.end());
+			const auto sync_end = std::min(limit, found->second->end());
 			const auto range = utils::address_range::start_end(local_address, sync_end);
 
 			if constexpr (load)
 			{
-				found->second.load(range);
+				found->second->load(range);
 			}
 			else
 			{
-				found->second.flush(range);
+				found->second->flush(range);
 			}
 
 			if (sync_end < limit) [[unlikely]]
diff --git a/rpcs3/Emu/RSX/VK/VKDMA.h b/rpcs3/Emu/RSX/VK/VKDMA.h
index 6dbd6104f5..cfdeb146b1 100644
--- a/rpcs3/Emu/RSX/VK/VKDMA.h
+++ b/rpcs3/Emu/RSX/VK/VKDMA.h
@@ -12,6 +12,7 @@ namespace vk
 
 	class dma_block
 	{
+	protected:
 		enum page_bits
 		{
 			synchronized = 0,
@@ -30,8 +31,9 @@ namespace vk
 		std::unique_ptr<buffer> allocated_memory;
 		std::vector<u64> page_info;
 
-		void* map_range(const utils::address_range& range);
-		void unmap();
+		virtual void allocate(const render_device& dev, usz size);
+		virtual void* map_range(const utils::address_range& range);
+		virtual void unmap();
 
 		void set_page_bit(u32 page, u64 bits);
 		bool test_page_bit(u32 page, u64 bits);
@@ -40,10 +42,10 @@ namespace vk
 
 	public:
 
-		void init(const render_device& dev, u32 addr, usz size);
-		void init(dma_block* parent, u32 addr, usz size);
-		void flush(const utils::address_range& range);
-		void load(const utils::address_range& range);
+		virtual void init(const render_device& dev, u32 addr, usz size);
+		virtual void init(dma_block* parent, u32 addr, usz size);
+		virtual void flush(const utils::address_range& range);
+		virtual void load(const utils::address_range& range);
 		std::pair get(const utils::address_range& range);
 
 		u32 start() const;
@@ -52,7 +54,19 @@ namespace vk
 		dma_block* head();
 		const dma_block* head() const;
 
-		void set_parent(const command_buffer& cmd, dma_block* parent);
-		void extend(const command_buffer& cmd, const render_device& dev, usz new_size);
+		virtual void set_parent(const command_buffer& cmd, dma_block* parent);
+		virtual void extend(const command_buffer& cmd, const render_device& dev, usz new_size);
+	};
+
+	class dma_block_EXT: public dma_block
+	{
+	private:
+		void allocate(const render_device& dev, usz size) override;
+		void* map_range(const utils::address_range& range) override;
+		void unmap() override;
+
+	public:
+		void flush(const utils::address_range& range) override;
+		void load(const utils::address_range& range) override;
 	};
 }
diff --git a/rpcs3/Emu/RSX/VK/vkutils/buffer_object.cpp b/rpcs3/Emu/RSX/VK/vkutils/buffer_object.cpp
index 8f8c98b21f..a9f23a9647 100644
--- a/rpcs3/Emu/RSX/VK/vkutils/buffer_object.cpp
+++ b/rpcs3/Emu/RSX/VK/vkutils/buffer_object.cpp
@@ -66,6 +66,53 @@ namespace vk
 		vkBindBufferMemory(dev, value, memory->get_vk_device_memory(), memory->get_vk_device_memory_offset());
 	}
 
+	buffer::buffer(const vk::render_device& dev, VkBufferUsageFlags usage, void* host_pointer, u64 size)
+		: m_device(dev)
+	{
+		info.size = size;
+		info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
+		info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
+		info.flags = 0;
+		info.usage = usage;
+
+		VkExternalMemoryBufferCreateInfoKHR ex_info;
+		ex_info.sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO_KHR;
+		ex_info.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT;
+		ex_info.pNext = nullptr;
+
+		info.pNext = &ex_info;
+		CHECK_RESULT(vkCreateBuffer(m_device, &info, nullptr, &value));
+
+		auto& memory_map = dev.get_memory_mapping();
+		u32 memory_type_index = memory_map.host_visible_coherent;
+		VkFlags access_flags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
+
+		ensure(memory_map.getMemoryHostPointerPropertiesEXT);
+
+		VkMemoryHostPointerPropertiesEXT memory_properties{};
+		memory_properties.sType = VK_STRUCTURE_TYPE_MEMORY_HOST_POINTER_PROPERTIES_EXT;
+		memory_map.getMemoryHostPointerPropertiesEXT(dev, VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT, host_pointer, &memory_properties);
+
+		VkMemoryRequirements memory_reqs;
+		vkGetBufferMemoryRequirements(m_device, value, &memory_reqs);
+
+		auto required_memory_type_bits = memory_reqs.memoryTypeBits & memory_properties.memoryTypeBits;
+		if (!required_memory_type_bits)
+		{
+			// AMD driver bug. Buffers created with external memory extension return type bits of 0
+			rsx_log.warning("Could not match buffer requirements and host pointer properties.");
+			required_memory_type_bits = memory_properties.memoryTypeBits;
+		}
+
+		if (!dev.get_compatible_memory_type(required_memory_type_bits, access_flags, &memory_type_index))
+		{
+			fmt::throw_exception("No compatible memory type was found!");
+		}
+
+		memory = std::make_unique<memory_block_host>(m_device, host_pointer, size, memory_type_index);
+		vkBindBufferMemory(dev, value, memory->get_vk_device_memory(), memory->get_vk_device_memory_offset());
+	}
+
 	buffer::~buffer()
 	{
 		vkDestroyBuffer(m_device, value, nullptr);
diff --git a/rpcs3/Emu/RSX/VK/vkutils/buffer_object.h b/rpcs3/Emu/RSX/VK/vkutils/buffer_object.h
index 3957cd3f1d..f82c9819ef 100644
--- a/rpcs3/Emu/RSX/VK/vkutils/buffer_object.h
+++ b/rpcs3/Emu/RSX/VK/vkutils/buffer_object.h
@@ -30,6 +30,7 @@ namespace vk
 		std::unique_ptr<memory_block> memory;
 
 		buffer(const vk::render_device& dev, u64 size, u32 memory_type_index, u32 access_flags, VkBufferUsageFlags usage, VkBufferCreateFlags flags);
+		buffer(const vk::render_device& dev, VkBufferUsageFlags usage, void* host_pointer, u64 size);
 		~buffer();
 
 		void* map(u64 offset, u64 size);
diff --git a/rpcs3/Emu/RSX/VK/vkutils/device.cpp b/rpcs3/Emu/RSX/VK/vkutils/device.cpp
index 6c7d76d4f1..123d58fac5 100644
--- a/rpcs3/Emu/RSX/VK/vkutils/device.cpp
+++ b/rpcs3/Emu/RSX/VK/vkutils/device.cpp
@@ -56,6 +56,7 @@ namespace vk
 
 		stencil_export_support = device_extensions.is_supported(VK_EXT_SHADER_STENCIL_EXPORT_EXTENSION_NAME);
 		conditional_render_support = device_extensions.is_supported(VK_EXT_CONDITIONAL_RENDERING_EXTENSION_NAME);
+		external_memory_host_support = device_extensions.is_supported(VK_EXT_EXTERNAL_MEMORY_HOST_EXTENSION_NAME);
 		unrestricted_depth_range_support = device_extensions.is_supported(VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME);
 	}
 
@@ -262,6 +263,11 @@ namespace vk
 			requested_extensions.push_back(VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME);
 		}
 
+		if (pgpu->external_memory_host_support)
+		{
+			requested_extensions.push_back(VK_EXT_EXTERNAL_MEMORY_HOST_EXTENSION_NAME);
+		}
+
 		enabled_features.robustBufferAccess = VK_TRUE;
 		enabled_features.fullDrawIndexUint32 = VK_TRUE;
 		enabled_features.independentBlend = VK_TRUE;
@@ -362,6 +368,11 @@ namespace vk
 		m_formats_support = vk::get_optimal_tiling_supported_formats(pdev);
 		m_pipeline_binding_table = vk::get_pipeline_binding_table(pdev);
 
+		if (pgpu->external_memory_host_support)
+		{
+			memory_map.getMemoryHostPointerPropertiesEXT = reinterpret_cast<PFN_vkGetMemoryHostPointerPropertiesEXT>(vkGetDeviceProcAddr(dev, "vkGetMemoryHostPointerPropertiesEXT"));
+		}
+
 		if (g_cfg.video.disable_vulkan_mem_allocator)
 			m_allocator = std::make_unique(dev, pdev);
 		else
diff --git a/rpcs3/Emu/RSX/VK/vkutils/device.h b/rpcs3/Emu/RSX/VK/vkutils/device.h
index 46c64aa56b..c3bcf293b3 100644
--- a/rpcs3/Emu/RSX/VK/vkutils/device.h
+++ b/rpcs3/Emu/RSX/VK/vkutils/device.h
@@ -30,6 +30,8 @@ namespace vk
 	{
 		u32 host_visible_coherent;
 		u32 device_local;
+
+		PFN_vkGetMemoryHostPointerPropertiesEXT getMemoryHostPointerPropertiesEXT;
 	};
 
 	class physical_device
@@ -47,6 +49,7 @@ namespace vk
 
 		bool stencil_export_support = false;
 		bool conditional_render_support = false;
+		bool external_memory_host_support = false;
 		bool unrestricted_depth_range_support = false;
 
 		friend class render_device;
diff --git a/rpcs3/Emu/RSX/VK/vkutils/memory.cpp b/rpcs3/Emu/RSX/VK/vkutils/memory.cpp
index 2b42f28f4d..aa73bc44cb 100644
--- a/rpcs3/Emu/RSX/VK/vkutils/memory.cpp
+++ b/rpcs3/Emu/RSX/VK/vkutils/memory.cpp
@@ -189,7 +189,53 @@ namespace vk
 
 	memory_block::~memory_block()
 	{
-		m_mem_allocator->free(m_mem_handle);
+		if (m_mem_allocator)
+		{
+			m_mem_allocator->free(m_mem_handle);
+		}
+	}
+
+	memory_block_host::memory_block_host(VkDevice dev, void* host_pointer, u64 size, u32 memory_type_index) :
+		m_device(dev), m_mem_handle(VK_NULL_HANDLE), m_host_pointer(host_pointer)
+	{
+		VkMemoryAllocateInfo alloc_info{};
+		VkImportMemoryHostPointerInfoEXT import_info{};
+
+		alloc_info.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
+		alloc_info.memoryTypeIndex = memory_type_index;
+		alloc_info.allocationSize = size;
+		alloc_info.pNext = &import_info;
+
+		import_info.sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT;
+		import_info.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT;
+		import_info.pHostPointer = host_pointer;
+
+		CHECK_RESULT(vkAllocateMemory(m_device, &alloc_info, nullptr, &m_mem_handle));
+	}
+
+	memory_block_host::~memory_block_host()
+	{
+		vkFreeMemory(m_device, m_mem_handle, nullptr);
+	}
+
+	VkDeviceMemory memory_block_host::get_vk_device_memory()
+	{
+		return m_mem_handle;
+	}
+
+	u64 memory_block_host::get_vk_device_memory_offset()
+	{
+		return 0ull;
+	}
+
+	void* memory_block_host::map(u64 offset, u64 size)
+	{
+		return reinterpret_cast<u8*>(m_host_pointer) + offset;
+	}
+
+	void memory_block_host::unmap()
+	{
+		// NOP
 	}
 
 	VkDeviceMemory memory_block::get_vk_device_memory()
diff --git a/rpcs3/Emu/RSX/VK/vkutils/memory.h b/rpcs3/Emu/RSX/VK/vkutils/memory.h
index 61ff3b21e0..7452f85ad3 100644
--- a/rpcs3/Emu/RSX/VK/vkutils/memory.h
+++ b/rpcs3/Emu/RSX/VK/vkutils/memory.h
@@ -82,23 +82,46 @@ namespace vk
 	struct memory_block
 	{
 		memory_block(VkDevice dev, u64 block_sz, u64 alignment, u32 memory_type_index);
-		~memory_block();
+		virtual ~memory_block();
 
-		VkDeviceMemory get_vk_device_memory();
-		u64 get_vk_device_memory_offset();
+		virtual VkDeviceMemory get_vk_device_memory();
+		virtual u64 get_vk_device_memory_offset();
 
-		void* map(u64 offset, u64 size);
-		void unmap();
+		virtual void* map(u64 offset, u64 size);
+		virtual void unmap();
 
 		memory_block(const memory_block&) = delete;
 		memory_block(memory_block&&) = delete;
 
+	protected:
+		memory_block() = default;
+
 	private:
 		VkDevice m_device;
-		vk::mem_allocator_base* m_mem_allocator;
+		vk::mem_allocator_base* m_mem_allocator = nullptr;
 		mem_allocator_base::mem_handle_t m_mem_handle;
 	};
 
+	struct memory_block_host : public memory_block
+	{
+		memory_block_host(VkDevice dev, void* host_pointer, u64 size, u32 memory_type_index);
+		~memory_block_host();
+
+		VkDeviceMemory get_vk_device_memory() override;
+		u64 get_vk_device_memory_offset() override;
+		void* map(u64 offset, u64 size) override;
+		void unmap() override;
+
+		memory_block_host(const memory_block_host&) = delete;
+		memory_block_host(memory_block_host&&) = delete;
+		memory_block_host() = delete;
+
+	private:
+		VkDevice m_device;
+		VkDeviceMemory m_mem_handle;
+		void* m_host_pointer;
+	};
+
 	void vmm_notify_memory_allocated(void* handle, u32 memory_type, u64 memory_size);
 	void vmm_notify_memory_freed(void* handle);
 	void vmm_reset();
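
For context, the sketch below condenses the VK_EXT_external_memory_host import sequence that this patch wires into vk::buffer and vk::memory_block_host: create a buffer flagged for host-allocation external memory, query which memory types can import the pointer, then allocate with VkImportMemoryHostPointerInfoEXT instead of fresh device memory. It is an illustrative sketch, not code from the patch; the helper name import_host_buffer, its error handling, and the memory-type search loop are assumptions, while the Vulkan structures and entry points are the real extension API. The extension requires the imported pointer and size to be multiples of VkPhysicalDeviceExternalMemoryHostPropertiesEXT::minImportedHostPointerAlignment (typically 4 KiB), which lines up with the smaller 0x1000 DMA block granularity adopted in VKDMA.cpp.

// Illustrative sketch of a VK_EXT_external_memory_host import (not part of the patch).
#include <vulkan/vulkan.h>
#include <cstdint>
#include <stdexcept>

struct imported_buffer
{
	VkBuffer buffer = VK_NULL_HANDLE;
	VkDeviceMemory memory = VK_NULL_HANDLE;
};

// host_ptr and size must be aligned to minImportedHostPointerAlignment.
imported_buffer import_host_buffer(VkDevice dev, VkPhysicalDevice pdev, void* host_ptr, VkDeviceSize size)
{
	imported_buffer out{};

	// 1. Create the buffer, declaring that its memory will be imported from a host allocation.
	VkExternalMemoryBufferCreateInfo ex_info{ VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO };
	ex_info.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT;

	VkBufferCreateInfo info{ VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO };
	info.pNext = &ex_info;
	info.size = size;
	info.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT;
	info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
	if (vkCreateBuffer(dev, &info, nullptr, &out.buffer) != VK_SUCCESS)
		throw std::runtime_error("vkCreateBuffer failed");

	// 2. Ask the driver which memory types can import this particular host pointer.
	auto get_props = reinterpret_cast<PFN_vkGetMemoryHostPointerPropertiesEXT>(
		vkGetDeviceProcAddr(dev, "vkGetMemoryHostPointerPropertiesEXT"));
	if (!get_props)
		throw std::runtime_error("VK_EXT_external_memory_host not available");

	VkMemoryHostPointerPropertiesEXT host_props{ VK_STRUCTURE_TYPE_MEMORY_HOST_POINTER_PROPERTIES_EXT };
	get_props(dev, VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT, host_ptr, &host_props);

	VkMemoryRequirements reqs{};
	vkGetBufferMemoryRequirements(dev, out.buffer, &reqs);

	// 3. Pick a host-visible memory type valid for both the buffer and the host pointer.
	VkPhysicalDeviceMemoryProperties mem_props{};
	vkGetPhysicalDeviceMemoryProperties(pdev, &mem_props);
	const uint32_t candidates = reqs.memoryTypeBits & host_props.memoryTypeBits;
	uint32_t type_index = UINT32_MAX;
	for (uint32_t i = 0; i < mem_props.memoryTypeCount; ++i)
	{
		if ((candidates & (1u << i)) &&
			(mem_props.memoryTypes[i].propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT))
		{
			type_index = i;
			break;
		}
	}
	if (type_index == UINT32_MAX)
		throw std::runtime_error("No compatible memory type for host import");

	// 4. Allocate by importing the host allocation instead of fresh device memory, then bind.
	VkImportMemoryHostPointerInfoEXT import_info{ VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT };
	import_info.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT;
	import_info.pHostPointer = host_ptr;

	VkMemoryAllocateInfo alloc{ VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO };
	alloc.pNext = &import_info;
	alloc.allocationSize = size;
	alloc.memoryTypeIndex = type_index;
	if (vkAllocateMemory(dev, &alloc, nullptr, &out.memory) != VK_SUCCESS)
		throw std::runtime_error("vkAllocateMemory failed");

	vkBindBufferMemory(dev, out.buffer, out.memory, 0);
	return out;
}

In the patch itself, dma_block_EXT plays the role of the caller: the host pointer it imports is vm::get_super_ptr(base_address), so the Vulkan buffer aliases guest memory directly and the flush()/load() synchronization paths degenerate to no-ops.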