
rsx: Improvements to memory flush mechanism

- Batch dma transfers whenever possible and do them in one go (see the sketch after this list)
- vk: Always ensure that queued dma transfers are visible to the GPU before they are needed by the host
  Requires a little refactoring to allow proper communication of the command buffer state
- vk: Code cleanup; the simplified mechanism makes it unnecessary to pass tons of args to methods
- vk: Fixup - do not forcefully do dma transfers on sections in an invalidation zone! They may have been speculated correctly already
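The batching idea can be sketched roughly as follows. This is a minimal illustration using simplified stand-in types (command_buffer, cached_section, flush_all are placeholders); the real classes in the diff below are templated and take extra arguments.

#include <vector>

struct command_buffer { /* backend command buffer state */ };

struct cached_section
{
    bool synchronized = false;

    // Record a DMA copy of this section into a host-visible buffer (placeholder)
    void copy_texture(command_buffer& /*cmd*/, bool /*hard_fault*/) { synchronized = true; }

    // Write the now host-visible data back to guest memory (placeholder)
    void flush() {}
};

// Old behaviour: every unsynchronized section triggered its own submit + wait.
// New behaviour: gather all hard faults, record their transfers into one command
// buffer, submit and wait once, then flush everything from host-visible memory.
void flush_all(command_buffer& cmd, std::vector<cached_section*>& sections_to_flush)
{
    std::vector<cached_section*> sections_to_transfer;
    for (auto* section : sections_to_flush)
    {
        if (!section->synchronized)
        {
            sections_to_transfer.push_back(section); // hard fault, needs a DMA transfer
        }
    }

    if (!sections_to_transfer.empty())
    {
        // prepare_for_dma_transfers(cmd):  e.g. begin recording (vk backend)
        for (auto* section : sections_to_transfer)
        {
            section->copy_texture(cmd, true); // batch all transfers into one submission
        }
        // cleanup_after_dma_transfers(cmd): e.g. submit once, wait, restart the cb
    }

    for (auto* section : sections_to_flush)
    {
        section->flush(); // data is guaranteed host-visible at this point
    }
}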
kd-11 2019-03-16 12:14:11 +03:00 committed by kd-11
parent 385485204b
commit 5260f4b47d
8 changed files with 178 additions and 140 deletions


@ -364,6 +364,8 @@ namespace rsx
virtual image_view_type generate_atlas_from_images(commandbuffer_type&, u32 gcm_format, u16 width, u16 height, const std::vector<copy_region_descriptor>& sections_to_copy, const texture_channel_remap_t& remap_vector) = 0;
virtual void update_image_contents(commandbuffer_type&, image_view_type dst, image_resource_type src, u16 width, u16 height) = 0;
virtual bool render_target_format_is_compatible(image_storage_type* tex, u32 gcm_format) = 0;
virtual void prepare_for_dma_transfers(commandbuffer_type&) = 0;
virtual void cleanup_after_dma_transfers(commandbuffer_type&) = 0;
public:
virtual void destroy() = 0;
@ -397,13 +399,13 @@ namespace rsx
template <typename... Args>
void err_once(const char* fmt, const Args&... params)
{
logs::RSX.error(fmt, params...);
emit_once(true, fmt, params...);
}
template <typename... Args>
void warn_once(const char* fmt, const Args&... params)
{
logs::RSX.warning(fmt, params...);
emit_once(false, fmt, params...);
}
/**
@ -458,19 +460,40 @@ namespace rsx
});
}
rsx::simple_array<section_storage_type*> sections_to_transfer;
for (auto &surface : data.sections_to_flush)
{
if (surface->get_memory_read_flags() == rsx::memory_read_flags::flush_always)
if (!surface->is_synchronized())
{
sections_to_transfer.push_back(surface);
}
else if (surface->get_memory_read_flags() == rsx::memory_read_flags::flush_always)
{
// This region is set to always read from itself (unavoidable hard sync)
const auto ROP_timestamp = rsx::get_current_renderer()->ROP_sync_timestamp;
if (surface->is_synchronized() && ROP_timestamp > surface->get_sync_timestamp())
if (ROP_timestamp > surface->get_sync_timestamp())
{
surface->copy_texture(cmd, true, std::forward<Args>(extras)...);
sections_to_transfer.push_back(surface);
}
}
}
surface->flush(cmd, std::forward<Args>(extras)...);
if (!sections_to_transfer.empty())
{
// Batch all hard faults together
prepare_for_dma_transfers(cmd);
for (auto &surface : sections_to_transfer)
{
surface->copy_texture(cmd, true, std::forward<Args>(extras)...);
}
cleanup_after_dma_transfers(cmd);
}
for (auto &surface : data.sections_to_flush)
{
surface->flush();
// Exclude this region when flushing other sections that should not trample it
// If we overlap an excluded RO, set it as dirty
@ -1224,7 +1247,7 @@ namespace rsx
}
template <typename ...FlushArgs, typename ...Args>
void lock_memory_region(commandbuffer_type& cmd, image_storage_type* image, const address_range &rsx_range, u32 width, u32 height, u32 pitch, std::tuple<FlushArgs...>&& flush_extras, Args&&... extras)
void lock_memory_region(commandbuffer_type& cmd, image_storage_type* image, const address_range &rsx_range, u32 width, u32 height, u32 pitch, Args&&... extras)
{
AUDIT(g_cfg.video.write_color_buffers || g_cfg.video.write_depth_buffer); // this method is only called when either WCB or WDB are enabled
@ -1244,10 +1267,7 @@ namespace rsx
if (!region.is_locked() || region.get_context() != texture_upload_context::framebuffer_storage)
{
// Invalidate sections from surface cache occupying same address range
std::apply(&texture_cache::invalidate_range_impl_base<FlushArgs...>, std::tuple_cat(
std::forward_as_tuple(this, cmd, rsx_range, invalidation_cause::superseded_by_fbo),
std::forward<std::tuple<FlushArgs...> >(flush_extras)
));
invalidate_range_impl_base(cmd, rsx_range, invalidation_cause::superseded_by_fbo);
}
if (!region.is_locked() || region.can_be_reused())


@ -1303,14 +1303,10 @@ namespace rsx
return get_context() != texture_upload_context::shader_read && get_memory_read_flags() != memory_read_flags::flush_always;
}
void on_flush(bool miss)
void on_flush()
{
speculatively_flushed = false;
if (miss)
{
m_tex_cache->on_miss(*derived());
}
m_tex_cache->on_flush();
if (tracked_by_predictor())
@ -1328,6 +1324,12 @@ namespace rsx
m_tex_cache->on_speculative_flush();
}
void on_miss()
{
LOG_WARNING(RSX, "Cache miss at address 0x%X. This is gonna hurt...", get_section_base());
m_tex_cache->on_miss(*derived());
}
void touch(u64 tag)
{
last_write_tag = tag;
@ -1454,11 +1456,9 @@ namespace rsx
public:
// Returns false if there was a cache miss
template <typename ...Args>
bool flush(Args&&... extras)
void flush()
{
if (flushed) return true;
bool miss = false;
if (flushed) return;
// Sanity checks
ASSERT(exists());
@ -1469,19 +1469,12 @@ namespace rsx
{
flushed = true;
flush_exclusions.clear();
on_flush(miss);
return !miss;
on_flush();
return;
}
// If we are not synchronized, we must synchronize before proceeding (hard fault)
if (!synchronized)
{
LOG_WARNING(RSX, "Cache miss at address 0x%X. This is gonna hurt...", get_section_base());
derived()->synchronize(true, std::forward<Args>(extras)...);
miss = true;
ASSERT(synchronized); // TODO ruipin: This might be possible in OGL. Revisit
}
// NOTE: Hard faults should have been pre-processed beforehand
ASSERT(synchronized);
// Copy flush result to guest memory
imp_flush();
@ -1491,9 +1484,7 @@ namespace rsx
flushed = true;
derived()->finish_flush();
flush_exclusions.clear();
on_flush(miss);
return !miss;
on_flush();
}
void add_flush_exclusion(const address_range& rng)


@ -392,7 +392,7 @@ void GLGSRender::init_buffers(rsx::framebuffer_creation_context context, bool sk
{
// Mark buffer regions as NO_ACCESS on Cell-visible side
m_gl_texture_cache.lock_memory_region(cmd, std::get<1>(m_rtts.m_bound_render_targets[i]), surface_range, m_surface_info[i].width, m_surface_info[i].height, m_surface_info[i].pitch,
std::tuple<>{}, color_format.format, color_format.type, color_format.swap_bytes);
color_format.format, color_format.type, color_format.swap_bytes);
}
else
{
@ -407,7 +407,7 @@ void GLGSRender::init_buffers(rsx::framebuffer_creation_context context, bool sk
{
const auto depth_format_gl = rsx::internals::surface_depth_format_to_gl(layout.depth_format);
m_gl_texture_cache.lock_memory_region(cmd, std::get<1>(m_rtts.m_bound_depth_stencil), surface_range, m_depth_surface_info.width, m_depth_surface_info.height, m_depth_surface_info.pitch,
std::tuple<>{}, depth_format_gl.format, depth_format_gl.type, true);
depth_format_gl.format, depth_format_gl.type, true);
}
else
{


@ -217,14 +217,18 @@ namespace gl
}
}
void copy_texture(gl::command_context& cmd, bool manage_lifetime)
void copy_texture(gl::command_context& cmd, bool miss)
{
ASSERT(exists());
if (!manage_lifetime)
if (LIKELY(!miss))
{
baseclass::on_speculative_flush();
}
else
{
baseclass::on_miss();
}
if (context == rsx::texture_upload_context::framebuffer_storage)
{
@ -347,15 +351,6 @@ namespace gl
/**
* Flush
*/
void synchronize(bool blocking, gl::command_context& cmd)
{
if (synchronized)
return;
verify(HERE), cmd.drv;
copy_texture(cmd, blocking);
}
void* map_synchronized(u32 offset, u32 size)
{
AUDIT(synchronized && !m_fence.is_empty());
@ -642,7 +637,7 @@ namespace gl
if (src)
{
//Format mismatch
err_once("GL format mismatch (data cast?). Sized ifmt=0x%X vs Src ifmt=0x%X", sized_internal_fmt, (GLenum)ifmt);
warn_once("GL format mismatch (data cast?). Sized ifmt=0x%X vs Src ifmt=0x%X", sized_internal_fmt, (GLenum)ifmt);
}
//Apply base component map onto the new texture if a data cast has been done
@ -992,6 +987,12 @@ namespace gl
}
}
void prepare_for_dma_transfers(gl::command_context&) override
{}
void cleanup_after_dma_transfers(gl::command_context&) override
{}
public:
using baseclass::texture_cache;


@ -855,7 +855,7 @@ bool VKGSRender::on_access_violation(u32 address, bool is_writing)
std::lock_guard lock(m_secondary_cb_guard);
const rsx::invalidation_cause cause = is_writing ? rsx::invalidation_cause::deferred_write : rsx::invalidation_cause::deferred_read;
result = std::move(m_texture_cache.invalidate_address(m_secondary_command_buffer, address, cause, m_swapchain->get_graphics_queue()));
result = std::move(m_texture_cache.invalidate_address(m_secondary_command_buffer, address, cause));
}
if (!result.violation_handled)
@ -897,7 +897,7 @@ bool VKGSRender::on_access_violation(u32 address, bool is_writing)
m_flush_requests.producer_wait();
}
m_texture_cache.flush_all(m_secondary_command_buffer, result, m_swapchain->get_graphics_queue());
m_texture_cache.flush_all(m_secondary_command_buffer, result);
if (has_queue_ref)
{
@ -913,7 +913,7 @@ void VKGSRender::on_invalidate_memory_range(const utils::address_range &range)
{
std::lock_guard lock(m_secondary_cb_guard);
auto data = std::move(m_texture_cache.invalidate_range(m_secondary_command_buffer, range, rsx::invalidation_cause::unmap, m_swapchain->get_graphics_queue()));
auto data = std::move(m_texture_cache.invalidate_range(m_secondary_command_buffer, range, rsx::invalidation_cause::unmap));
AUDIT(data.empty());
if (data.violation_handled)
@ -1454,7 +1454,7 @@ void VKGSRender::end()
if (rsx::method_registers.fragment_textures[i].enabled())
{
*sampler_state = m_texture_cache._upload_texture(*m_current_command_buffer, rsx::method_registers.fragment_textures[i], m_rtts);
*sampler_state = m_texture_cache.upload_texture(*m_current_command_buffer, rsx::method_registers.fragment_textures[i], m_rtts);
const u32 texture_format = rsx::method_registers.fragment_textures[i].format() & ~(CELL_GCM_TEXTURE_UN | CELL_GCM_TEXTURE_LN);
const VkBool32 compare_enabled = (texture_format == CELL_GCM_TEXTURE_DEPTH16 || texture_format == CELL_GCM_TEXTURE_DEPTH24_D8 ||
@ -1526,7 +1526,7 @@ void VKGSRender::end()
if (rsx::method_registers.vertex_textures[i].enabled())
{
*sampler_state = m_texture_cache._upload_texture(*m_current_command_buffer, rsx::method_registers.vertex_textures[i], m_rtts);
*sampler_state = m_texture_cache.upload_texture(*m_current_command_buffer, rsx::method_registers.vertex_textures[i], m_rtts);
bool replace = !vs_sampler_handles[i];
const VkBool32 unnormalized_coords = !!(rsx::method_registers.vertex_textures[i].format() & CELL_GCM_TEXTURE_UN);
@ -1725,7 +1725,7 @@ void VKGSRender::end()
m_occlusion_map[m_active_query_info->driver_handle].indices.push_back(occlusion_id);
m_occlusion_map[m_active_query_info->driver_handle].command_buffer_to_wait = m_current_command_buffer;
m_current_command_buffer->flags |= cb_has_occlusion_task;
m_current_command_buffer->flags |= vk::command_buffer::cb_has_occlusion_task;
}
// Apply write memory barriers
@ -1796,7 +1796,6 @@ void VKGSRender::end()
m_occlusion_query_pool.end_query(*m_current_command_buffer, occlusion_id);
}
m_current_command_buffer->num_draws++;
m_rtts.on_write();
rsx::thread::end();
@ -2187,7 +2186,7 @@ void VKGSRender::sync_hint(rsx::FIFO_hint hint)
{
if (hint == rsx::FIFO_hint::hint_conditional_render_eval)
{
if (m_current_command_buffer->flags & cb_has_occlusion_task)
if (m_current_command_buffer->flags & vk::command_buffer::cb_has_occlusion_task)
{
// Occlusion test result evaluation is coming up, avoid a hard sync
if (!m_flush_requests.pending())
@ -2881,7 +2880,7 @@ void VKGSRender::prepare_rtts(rsx::framebuffer_creation_context context)
const utils::address_range rsx_range = m_surface_info[i].get_memory_range();
m_texture_cache.set_memory_read_flags(rsx_range, rsx::memory_read_flags::flush_once);
m_texture_cache.flush_if_cache_miss_likely(*m_current_command_buffer, rsx_range, m_swapchain->get_graphics_queue());
m_texture_cache.flush_if_cache_miss_likely(*m_current_command_buffer, rsx_range);
}
m_surface_info[i].address = m_surface_info[i].pitch = 0;
@ -2898,7 +2897,7 @@ void VKGSRender::prepare_rtts(rsx::framebuffer_creation_context context)
auto old_format = vk::get_compatible_depth_surface_format(m_device->get_formats_support(), m_depth_surface_info.depth_format);
const utils::address_range surface_range = m_depth_surface_info.get_memory_range();
m_texture_cache.set_memory_read_flags(surface_range, rsx::memory_read_flags::flush_once);
m_texture_cache.flush_if_cache_miss_likely(*m_current_command_buffer, surface_range, m_swapchain->get_graphics_queue());
m_texture_cache.flush_if_cache_miss_likely(*m_current_command_buffer, surface_range);
}
m_depth_surface_info.address = m_depth_surface_info.pitch = 0;
@ -2944,6 +2943,12 @@ void VKGSRender::prepare_rtts(rsx::framebuffer_creation_context context)
m_texture_cache.notify_surface_changed(layout.zeta_address);
}
// Before messing with memory properties, flush command queue if there are dma transfers queued up
if (m_current_command_buffer->flags & vk::command_buffer::cb_has_dma_transfer)
{
flush_command_queue();
}
const auto color_fmt_info = vk::get_compatible_gcm_format(layout.color_format);
for (u8 index : m_draw_buffers)
{
@ -2953,11 +2958,11 @@ void VKGSRender::prepare_rtts(rsx::framebuffer_creation_context context)
if (g_cfg.video.write_color_buffers)
{
m_texture_cache.lock_memory_region(*m_current_command_buffer, std::get<1>(m_rtts.m_bound_render_targets[index]), surface_range,
m_surface_info[index].width, m_surface_info[index].height, layout.actual_color_pitch[index], std::tuple<VkQueue>{ m_swapchain->get_graphics_queue() }, color_fmt_info.first, color_fmt_info.second);
m_surface_info[index].width, m_surface_info[index].height, layout.actual_color_pitch[index], color_fmt_info.first, color_fmt_info.second);
}
else
{
m_texture_cache.commit_framebuffer_memory_region(*m_current_command_buffer, surface_range, m_swapchain->get_graphics_queue());
m_texture_cache.commit_framebuffer_memory_region(*m_current_command_buffer, surface_range);
}
}
@ -2968,11 +2973,11 @@ void VKGSRender::prepare_rtts(rsx::framebuffer_creation_context context)
{
const u32 gcm_format = (m_depth_surface_info.depth_format != rsx::surface_depth_format::z16) ? CELL_GCM_TEXTURE_DEPTH16 : CELL_GCM_TEXTURE_DEPTH24_D8;
m_texture_cache.lock_memory_region(*m_current_command_buffer, std::get<1>(m_rtts.m_bound_depth_stencil), surface_range,
m_depth_surface_info.width, m_depth_surface_info.height, layout.actual_zeta_pitch, std::tuple<VkQueue>{ m_swapchain->get_graphics_queue() }, gcm_format, false);
m_depth_surface_info.width, m_depth_surface_info.height, layout.actual_zeta_pitch, gcm_format, false);
}
else
{
m_texture_cache.commit_framebuffer_memory_region(*m_current_command_buffer, surface_range, m_swapchain->get_graphics_queue());
m_texture_cache.commit_framebuffer_memory_region(*m_current_command_buffer, surface_range);
}
}
@ -3323,21 +3328,22 @@ void VKGSRender::flip(int buffer)
const auto range = utils::address_range::start_length(absolute_address, buffer_pitch * buffer_height);
const u32 lookup_mask = rsx::texture_upload_context::blit_engine_dst | rsx::texture_upload_context::framebuffer_storage;
const auto overlap = m_texture_cache.find_texture_from_range<true>(range, 0, lookup_mask);
bool flush_queue = false;
for (const auto & section : overlap)
{
section->copy_texture(*m_current_command_buffer, false, m_swapchain->get_graphics_queue());
flush_queue = true;
if (!section->is_synchronized())
{
section->copy_texture(*m_current_command_buffer, true);
}
}
if (flush_queue)
if (m_current_command_buffer->flags & vk::command_buffer::cb_has_dma_transfer)
{
// Submit for processing to lower hard fault penalty
flush_command_queue();
}
m_texture_cache.invalidate_range(*m_current_command_buffer, range, rsx::invalidation_cause::read, m_swapchain->get_graphics_queue());
m_texture_cache.invalidate_range(*m_current_command_buffer, range, rsx::invalidation_cause::read);
image_to_flip = m_texture_cache.upload_image_simple(*m_current_command_buffer, absolute_address, buffer_width, buffer_height);
}
}
@ -3487,16 +3493,15 @@ bool VKGSRender::scaled_image_from_memory(rsx::blit_src_info& src, rsx::blit_dst
//Verify enough memory exists before attempting to handle data transfer
check_heap_status();
const auto old_speculations_count = m_texture_cache.get_num_cache_speculative_writes();
if (m_texture_cache.blit(src, dst, interpolate, m_rtts, *m_current_command_buffer))
{
m_samplers_dirty.store(true);
m_current_command_buffer->flags |= cb_has_blit_transfer;
m_current_command_buffer->set_flag(vk::command_buffer::cb_has_blit_transfer);
if (m_texture_cache.get_num_cache_speculative_writes() > old_speculations_count)
if (m_current_command_buffer->flags & vk::command_buffer::cb_has_dma_transfer)
{
// A speculative write happened, flush while the dma resource is valid
// TODO: Deeper investigation as to why this can trigger problems
// A dma transfer has been queued onto this cb
// This likely means that we're done with the transfers to the target (writes_likely_completed=1)
flush_command_queue();
}
return true;


@ -48,20 +48,11 @@ namespace vk
extern u64 get_system_time();
enum command_buffer_data_flag
{
cb_has_occlusion_task = 1,
cb_has_blit_transfer = 2
};
struct command_buffer_chunk: public vk::command_buffer
{
VkFence submit_fence = VK_NULL_HANDLE;
VkDevice m_device = VK_NULL_HANDLE;
u32 num_draws = 0;
u32 flags = 0;
std::atomic_bool pending = { false };
std::atomic<u64> last_sync = { 0 };
shared_mutex guard_mutex;
@ -100,8 +91,6 @@ struct command_buffer_chunk: public vk::command_buffer
wait(FRAME_PRESENT_TIMEOUT);
CHECK_RESULT(vkResetCommandBuffer(commands, 0));
num_draws = 0;
flags = 0;
}
bool poke()


@ -1168,6 +1168,14 @@ namespace vk
}
access_hint = flush_only;
enum command_buffer_data_flag : u32
{
cb_has_occlusion_task = 1,
cb_has_blit_transfer = 2,
cb_has_dma_transfer = 4
};
u32 flags = 0;
public:
command_buffer() {}
~command_buffer() {}
@ -1206,6 +1214,16 @@ namespace vk
return *pool;
}
void clear_flags()
{
flags = 0;
}
void set_flag(command_buffer_data_flag flag)
{
flags |= flag;
}
operator VkCommandBuffer() const
{
return commands;
@ -1278,6 +1296,8 @@ namespace vk
acquire_global_submit_lock();
CHECK_RESULT(vkQueueSubmit(queue, 1, &infos, fence));
release_global_submit_lock();
clear_flags();
}
};


@ -66,11 +66,21 @@ namespace vk
managed_texture.reset(vram_texture);
}
//Even if we are managing the same vram section, we cannot guarantee contents are static
//The create method is only invoked when a new managed session is required
synchronized = false;
flushed = false;
sync_timestamp = 0ull;
if (synchronized)
{
// Even if we are managing the same vram section, we cannot guarantee contents are static
// The create method is only invoked when a new managed session is required
if (!flushed)
{
// Reset fence
verify(HERE), m_device, dma_buffer, dma_fence != VK_NULL_HANDLE;
vkResetEvent(*m_device, dma_fence);
}
synchronized = false;
flushed = false;
sync_timestamp = 0ull;
}
// Notify baseclass
baseclass::on_section_resources_created();
@ -148,14 +158,18 @@ namespace vk
return flushed;
}
void copy_texture(vk::command_buffer& cmd, bool manage_cb_lifetime, VkQueue submit_queue)
void copy_texture(vk::command_buffer& cmd, bool miss)
{
ASSERT(exists());
if (!manage_cb_lifetime)
if (LIKELY(!miss))
{
baseclass::on_speculative_flush();
}
else
{
baseclass::on_miss();
}
if (m_device == nullptr)
{
@ -175,11 +189,6 @@ namespace vk
dma_buffer.reset(new vk::buffer(*m_device, align(get_section_size(), 256), memory_type, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0));
}
if (manage_cb_lifetime)
{
cmd.begin();
}
if (context == rsx::texture_upload_context::framebuffer_storage)
{
auto as_rtt = static_cast<vk::render_target*>(vram_texture);
@ -295,36 +304,20 @@ namespace vk
vkCmdCopyBuffer(cmd, mem_target->value, dma_buffer->value, 1, &copy);
}
if (manage_cb_lifetime)
if (LIKELY(!miss))
{
VkFence submit_fence;
VkFenceCreateInfo create_info{};
create_info.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
vkCreateFence(*m_device, &create_info, nullptr, &submit_fence);
cmd.end();
cmd.submit(submit_queue, {}, submit_fence, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT);
// Now we need to restart the command-buffer to restore it to the way it was before...
vk::wait_for_fence(submit_fence);
CHECK_RESULT(vkResetCommandBuffer(cmd, 0));
// Cleanup
vkDestroyFence(*m_device, submit_fence, nullptr);
vkSetEvent(*m_device, dma_fence);
if (cmd.access_hint != vk::command_buffer::access_type_hint::all)
{
// If this is a primary CB, restart it
cmd.begin();
}
// If this is speculated, it should only occur once
verify(HERE), vkGetEventStatus(*m_device, dma_fence) == VK_EVENT_RESET;
}
else
{
// Only used when doing speculation
verify(HERE), vkGetEventStatus(*m_device, dma_fence) == VK_EVENT_RESET;
vkCmdSetEvent(cmd, dma_fence, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT);
// This is the only acceptable situation where a sync can occur twice, due to flush_always being set
vkResetEvent(*m_device, dma_fence);
}
cmd.set_flag(vk::command_buffer::cb_has_dma_transfer);
vkCmdSetEvent(cmd, dma_fence, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT);
synchronized = true;
sync_timestamp = get_system_time();
}
@ -332,19 +325,6 @@ namespace vk
/**
* Flush
*/
void synchronize(bool blocking, vk::command_buffer& cmd, VkQueue submit_queue)
{
if (synchronized)
return;
if (m_device == nullptr)
{
m_device = &cmd.get_command_pool().get_owner();
}
copy_texture(cmd, blocking, submit_queue);
}
void* map_synchronized(u32 offset, u32 size)
{
AUDIT(synchronized);
@ -1104,6 +1084,44 @@ namespace vk
}
}
void prepare_for_dma_transfers(vk::command_buffer& cmd) override
{
if (!cmd.is_recording())
{
cmd.begin();
}
}
void cleanup_after_dma_transfers(vk::command_buffer& cmd) override
{
// End recording
cmd.end();
if (cmd.access_hint != vk::command_buffer::access_type_hint::all)
{
// Primary access command queue, must restart it after
VkFence submit_fence;
VkFenceCreateInfo info{};
info.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
vkCreateFence(*m_device, &info, nullptr, &submit_fence);
cmd.submit(m_submit_queue, {}, submit_fence, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT);
vk::wait_for_fence(submit_fence, GENERAL_WAIT_TIMEOUT);
vkDestroyFence(*m_device, submit_fence, nullptr);
CHECK_RESULT(vkResetCommandBuffer(cmd, 0));
cmd.begin();
}
else
{
// Auxiliary command queue with auto-restart capability
cmd.submit(m_submit_queue, {}, VK_NULL_HANDLE, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT);
}
verify(HERE), cmd.flags == 0;
}
public:
using baseclass::texture_cache;
@ -1181,12 +1199,6 @@ namespace vk
baseclass::on_frame_end();
}
template<typename RsxTextureType>
sampled_image_descriptor _upload_texture(vk::command_buffer& cmd, RsxTextureType& tex, rsx::vk_render_targets& m_rtts)
{
return upload_texture(cmd, tex, m_rtts, const_cast<const VkQueue>(m_submit_queue));
}
vk::image *upload_image_simple(vk::command_buffer& cmd, u32 address, u32 width, u32 height)
{
if (!m_formats_support.bgra8_linear)
@ -1243,13 +1255,13 @@ namespace vk
bool blit(rsx::blit_src_info& src, rsx::blit_dst_info& dst, bool interpolate, rsx::vk_render_targets& m_rtts, vk::command_buffer& cmd)
{
blitter helper;
auto reply = upload_scaled_image(src, dst, interpolate, cmd, m_rtts, helper, const_cast<const VkQueue>(m_submit_queue));
auto reply = upload_scaled_image(src, dst, interpolate, cmd, m_rtts, helper);
if (reply.succeeded)
{
if (reply.real_dst_size)
{
flush_if_cache_miss_likely(cmd, reply.to_address_range(), m_submit_queue);
flush_if_cache_miss_likely(cmd, reply.to_address_range());
}
return true;