Implement spu_runtime::reset

Adds a way to recover from the "JIT: Out of memory" error by resetting the SPU runtime.
2025-01-31 12:31:45 +01:00 · 2019-03-18 23:01:16 +03:00 · 2019-03-18 23:01:16 +03:00 · 4b381fbbb1
commit 4b381fbbb1
parent 1880a17f79
7 changed files with 332 additions and 92 deletions
--- a/Utilities/JIT.cpp
+++ b/Utilities/JIT.cpp
@ -64,6 +64,8 @@ static u8* add_jit_memory(std::size_t size, uint align)

 		if (UNLIKELY(_new > 0x40000000))
 		{
+			// Sorry, we failed, and further attempts should fail too.
+			ctr = 0x40000000;
 			return -1;
 		}

@ -77,7 +79,7 @@ static u8* add_jit_memory(std::size_t size, uint align)

 	if (UNLIKELY(pos == -1))
 	{
-		LOG_FATAL(GENERAL, "JIT: Out of memory (size=0x%x, align=0x%x, off=0x%x)", size, align, Off);
+		LOG_WARNING(GENERAL, "JIT: Out of memory (size=0x%x, align=0x%x, off=0x%x)", size, align, Off);
 		return nullptr;
 	}

@ -181,10 +183,10 @@ void jit_runtime::finalize() noexcept
 	std::memcpy(alloc(s_data_init.size(), 1, false), s_data_init.data(), s_data_init.size());
 }

-::jit_runtime& asmjit::get_global_runtime()
+asmjit::JitRuntime& asmjit::get_global_runtime() // Returns the single shared asmjit runtime (function-local static, constructed on first call)
 {
 	// Magic static
-	static ::jit_runtime g_rt;
+	static asmjit::JitRuntime g_rt;
 	return g_rt;
 }

--- a/Utilities/JIT.h
+++ b/Utilities/JIT.h
@ -40,7 +40,7 @@ struct jit_runtime final : asmjit::HostRuntime
 namespace asmjit
 {
 	// Should only be used to build global functions
-	::jit_runtime& get_global_runtime();
+	asmjit::JitRuntime& get_global_runtime();

 	// Emit xbegin and adjacent loop, return label at xbegin
 	Label build_transaction_enter(X86Assembler& c, Label fallback);
--- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
@ -46,35 +46,20 @@ void spu_recompiler::init()
 	}
 }

-spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
+bool spu_recompiler::compile(u64 last_reset_count, const std::vector<u32>& func)
 {
-	init();
+	const auto fn_location = m_spurt->find(last_reset_count, func);

-	std::unique_lock lock(m_spurt->m_mutex);
-
-	// Try to find existing function, register new one if necessary
-	const auto fn_info = m_spurt->m_map.emplace(std::move(func_rv), nullptr);
-
-	auto& fn_location = fn_info.first->second;
-
-	if (!fn_location && !fn_info.second)
+	if (fn_location == spu_runtime::g_dispatcher)
 	{
-		// Wait if already in progress
-		while (!fn_location)
-		{
-			m_spurt->m_cond.wait(lock);
-		}
+		return true;
 	}

-	if (fn_location)
+	if (!fn_location)
 	{
-		return fn_location;
+		return false;
 	}

-	auto& func = fn_info.first->first;
-
-	lock.unlock();
-
 	using namespace asmjit;

 	SPUDisAsm dis_asm(CPUDisAsm_InterpreterMode);
@ -833,12 +818,20 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
 	// Compile and get function address
 	spu_function_t fn;

-	if (m_asmrt.add(&fn, &code))
+	if (auto err = m_asmrt.add(&fn, &code))
 	{
+		if (err == asmjit::ErrorCode::kErrorNoVirtualMemory)
+		{
+			return false;
+		}
+
 		LOG_FATAL(SPU, "Failed to build a function");
 	}

-	m_spurt->add(*fn_info.first, fn);
+	if (!m_spurt->add(last_reset_count, fn_location, fn))
+	{
+		return false;
+	}

 	if (g_cfg.core.spu_debug)
 	{
@ -848,7 +841,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
 		log += "\n\n\n";

 		// Append log file
-		fs::file(m_spurt->m_cache_path + "spu.log", fs::write + fs::append).write(log);
+		fs::file(m_spurt->get_cache_path() + "spu.log", fs::write + fs::append).write(log);
 	}

 	if (m_cache && g_cfg.core.spu_cache)
@ -856,7 +849,7 @@ spu_function_t spu_recompiler::compile(std::vector<u32>&& func_rv)
 		m_cache->add(func);
 	}

-	return fn;
+	return true;
 }

 spu_recompiler::XmmLink spu_recompiler::XmmAlloc() // get empty xmm register
@ -947,11 +940,21 @@ void spu_recompiler::branch_fixed(u32 target)
 		return;
 	}

+	const auto ppptr = m_spurt->make_branch_patchpoint(target);
+
 	c->mov(SPU_OFF_32(pc), target);
 	c->xor_(qw0->r32(), qw0->r32());
 	c->cmp(SPU_OFF_32(state), 0);
 	c->jnz(label_stop);
-	c->jmp(imm_ptr(m_spurt->make_branch_patchpoint(target)));
+
+	if (ppptr)
+	{
+		c->jmp(imm_ptr(ppptr));
+	}
+	else
+	{
+		c->ret();
+	}
 }

 void spu_recompiler::branch_indirect(spu_opcode_t op, bool jt, bool ret)
--- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.h
+++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.h
@ -13,7 +13,7 @@ public:

 	virtual void init() override;

-	virtual spu_function_t compile(std::vector<u32>&&) override;
+	virtual bool compile(u64 last_reset_count, const std::vector<u32>&) override;

 private:
 	// ASMJIT runtime
--- a/rpcs3/Emu/Cell/SPURecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPURecompiler.cpp
@ -23,6 +23,10 @@ const spu_decoder<spu_iname> s_spu_iname;

 extern u64 get_timebased_time();

+thread_local DECLARE(spu_runtime::workload){};
+
+thread_local DECLARE(spu_runtime::addrv){u32{0}};
+
 DECLARE(spu_runtime::tr_dispatch) = []
 {
 	// Generate a special trampoline to spu_recompiler_base::dispatch with pause instruction
@ -149,6 +153,7 @@ void spu_cache::initialize()
 	// Read cache
 	auto func_list = cache->get();
 	atomic_t<std::size_t> fnext{};
+	atomic_t<u8> fail_flag{0};

 	// Initialize compiler instances for parallel compilation
 	u32 max_threads = static_cast<u32>(g_cfg.core.llvm_threads);
@ -190,6 +195,9 @@ void spu_cache::initialize()

 	for (std::size_t i = 0; i < compilers.size(); i++) thread_queue.emplace_back("Worker " + std::to_string(i), [&, compiler = compilers[i].get()]()
 	{
+		// Register SPU runtime user
+		spu_runtime::passive_lock _passive_lock(compiler->get_runtime());
+
 		// Fake LS
 		std::vector<be_t<u32>> ls(0x10000);

@ -198,7 +206,7 @@ void spu_cache::initialize()
 		{
 			std::vector<u32>& func = func_list[func_i];

-			if (Emu.IsStopped())
+			if (Emu.IsStopped() || fail_flag)
 			{
 				g_progr_pdone++;
 				continue;
@ -222,7 +230,11 @@ void spu_cache::initialize()
 				LOG_ERROR(SPU, "[0x%05x] SPU Analyser failed, %u vs %u", func2[0], func2.size() - 1, size0 - 1);
 			}

-			compiler->compile(std::move(func));
+			if (!compiler->compile(0, func))
+			{
+				// Likely, out of JIT memory. Signal to prevent further building.
+				fail_flag |= 1;
+			}

 			// Clear fake LS
 			for (u32 i = 1, pos = start; i < func2.size(); i++, pos += 4)
@ -256,6 +268,14 @@ void spu_cache::initialize()
 		return;
 	}

+	if (fail_flag)
+	{
+		LOG_ERROR(SPU, "SPU Runtime: Cache building failed (too much data). SPU Cache will be disabled.");
+		spu_runtime::passive_lock _passive_lock(compilers[0]->get_runtime());
+		compilers[0]->get_runtime().reset(0);
+		return;
+	}
+
 	if (compilers.size() && !func_list.empty())
 	{
 		LOG_SUCCESS(SPU, "SPU Runtime: Built %u functions.", func_list.size());
@ -288,9 +308,18 @@ spu_runtime::spu_runtime()
 	LOG_SUCCESS(SPU, "SPU Recompiler Runtime initialized...");
 }

-void spu_runtime::add(std::pair<const std::vector<u32>, spu_function_t>& where, spu_function_t compiled)
+bool spu_runtime::add(u64 last_reset_count, void* _where, spu_function_t compiled)
 {
-	std::unique_lock lock(m_mutex);
+	writer_lock lock(*this);
+
+	// Check reset count (makes where invalid)
+	if (!_where || last_reset_count != m_reset_count)
+	{
+		return false;
+	}
+
+	// Use opaque pointer
+	auto& where = *static_cast<decltype(m_map)::value_type*>(_where);

 	// Function info
 	const std::vector<u32>& func = where.first;
@ -315,7 +344,12 @@ void spu_runtime::add(std::pair<const std::vector<u32>, spu_function_t>& where,
 	else
 	{
 		// Allocate some writable executable memory
-		u8* const wxptr = verify(HERE, jit_runtime::alloc(size0 * 20, 16));
+		u8* const wxptr = jit_runtime::alloc(size0 * 20, 16);
+
+		if (!wxptr)
+		{
+			return false;
+		}

 		// Raw assembly pointer
 		u8* raw = wxptr;
@ -547,13 +581,63 @@ void spu_runtime::add(std::pair<const std::vector<u32>, spu_function_t>& where,
 		g_dispatcher[func[0] / 4] = reinterpret_cast<spu_function_t>(reinterpret_cast<u64>(wxptr));
 	}

-	lock.unlock();
-	m_cond.notify_all();
+	// Notify in lock destructor
+	lock.notify = true;
+	return true;
 }

-spu_function_t spu_runtime::find(const se_t<u32, false>* ls, u32 addr)
+void* spu_runtime::find(u64 last_reset_count, const std::vector<u32>& func) // Returns nullptr (stale reset count), g_dispatcher (already compiled), or the map entry the caller must compile
 {
-	std::unique_lock lock(m_mutex);
+	writer_lock lock(*this);
+
+	// Check reset count (a mismatch means the caller's count is stale)
+	if (last_reset_count != m_reset_count)
+	{
+		return nullptr;
+	}
+
+	// Try to find existing function, register new one if necessary
+	const auto result = m_map.try_emplace(func, nullptr);
+
+	// Pointer to the value in the map (pair)
+	const auto fn_location = &*result.first;
+
+	if (fn_location->second)
+	{
+		// Already compiled: g_dispatcher is returned as the "nothing to do" marker that compile() checks for
+		return g_dispatcher;
+	}
+	else if (!result.second)
+	{
+		// Wait if already in progress (entry exists but holds no function yet)
+		while (!fn_location->second)
+		{
+			m_cond.wait(m_mutex);
+
+			// If reset count changed, fn_location is invalidated; also requires return
+			if (last_reset_count != m_reset_count)
+			{
+				return nullptr;
+			}
+		}
+
+		return g_dispatcher;
+	}
+
+	// Newly inserted entry: return its location to compile and use in add()
+	return fn_location;
+}
+
+spu_function_t spu_runtime::find(const se_t<u32, false>* ls, u32 addr) const
+{
+	const u64 reset_count = m_reset_count;
+
+	reader_lock lock(*this);
+
+	if (reset_count != m_reset_count)
+	{
+		return nullptr;
+	}

 	const u32 start = addr * (g_cfg.core.spu_block_size != spu_block_size_type::giga);

@ -591,6 +675,11 @@ spu_function_t spu_runtime::make_branch_patchpoint(u32 target) const
 {
 	u8* const raw = jit_runtime::alloc(16, 16);

+	if (!raw)
+	{
+		return nullptr;
+	}
+
 	// Save address of the following jmp
 #ifdef _WIN32
 	raw[0] = 0x4c; // lea r8, [rip+1]
@ -621,13 +710,50 @@ spu_function_t spu_runtime::make_branch_patchpoint(u32 target) const
 	return reinterpret_cast<spu_function_t>(raw);
 }

-void spu_runtime::handle_return(cpu_thread* _thr)
+u64 spu_runtime::reset(std::size_t last_reset_count) // Clears the function map and reinitializes JIT memory; returns the reset count to use afterwards
+{
+	writer_lock lock(*this); // Exclusive access; the constructor also drops this thread's passive lock
+
+	if (last_reset_count != m_reset_count || !m_reset_count.compare_and_swap_test(last_reset_count, last_reset_count + 1))
+	{
+		// Probably already reset (stale count or failed compare-and-swap): report the current count
+		return m_reset_count;
+	}
+
+	// Notify SPU threads: set jit_return on each and notify those for which test_and_set reports false
+	idm::select<named_thread<spu_thread>>([](u32, cpu_thread& cpu)
+	{
+		if (!cpu.state.test_and_set(cpu_flag::jit_return))
+		{
+			cpu.notify();
+		}
+	});
+
+	// Reset function map (may take some time)
+	m_map.clear();
+
+	// Wait for threads to catch on jit_return flag (spin until m_passive_locks reaches zero)
+	while (m_passive_locks)
+	{
+		busy_wait();
+	}
+
+	// Reinitialize (TODO)
+	jit_runtime::finalize();
+	jit_runtime::initialize();
+	return ++m_reset_count; // Second increment: the compare-and-swap above already advanced the count by one
+}
+
+void spu_runtime::handle_return(spu_thread* _spu) // Called from spu_thread::cpu_task when jit_return is set on the thread
 {
 	// Wait until the runtime becomes available
-	//writer_lock lock(*this);
+	writer_lock lock(*this);

-	// Simply reset the flag
-	_thr->state -= cpu_flag::jit_return;
+	// Reset stack mirror (fill it with 0xff bytes)
+	std::memset(_spu->stack_mirror.data(), 0xff, sizeof(spu_thread::stack_mirror));
+
+	// Reset the flag (remove jit_return from the thread state)
+	_spu->state -= cpu_flag::jit_return;
 }

 spu_recompiler_base::spu_recompiler_base()
@ -638,6 +764,19 @@ spu_recompiler_base::~spu_recompiler_base()
 {
 }

+void spu_recompiler_base::make_function(const std::vector<u32>& data) // Compiles `data`, resetting the runtime and retrying until compile() reports success
+{
+	for (u64 reset_count = m_spurt->get_reset_count();;) // No loop condition: the only exit is the break below
+	{
+		if (LIKELY(compile(reset_count, data)))
+		{
+			break;
+		}
+
+		reset_count = m_spurt->reset(reset_count); // compile() failed: reset the runtime and retry with the count reset() returns
+	}
+}
+
 void spu_recompiler_base::dispatch(spu_thread& spu, void*, u8* rip)
 {
 	// If code verification failed from a patched patchpoint, clear it with a dispatcher jump
@ -669,7 +808,7 @@ void spu_recompiler_base::dispatch(spu_thread& spu, void*, u8* rip)
 	}

 	// Compile
-	verify(HERE), spu.jit->compile(spu.jit->block(spu._ptr<u32>(0), spu.pc));
+	spu.jit->make_function(spu.jit->block(spu._ptr<u32>(0), spu.pc));

 	// Diagnostic
 	if (g_cfg.core.spu_block_size == spu_block_size_type::giga)
@ -2097,11 +2236,12 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator

 			// Generate a patchpoint for fixed location
 			const auto cblock = m_ir->GetInsertBlock();
+			const auto ppptr  = m_spurt->make_branch_patchpoint(target);
 			const auto result = llvm::BasicBlock::Create(m_context, "", m_function);
 			m_ir->SetInsertPoint(result);
 			m_ir->CreateStore(m_ir->getInt32(target), spu_ptr<u32>(&spu_thread::pc));
 			const auto type = llvm::FunctionType::get(get_type<void>(), {get_type<u8*>(), get_type<u8*>(), get_type<u32>()}, false)->getPointerTo();
-			tail(m_ir->CreateIntToPtr(m_ir->getInt64((u64)m_spurt->make_branch_patchpoint(target)), type));
+			tail(m_ir->CreateIntToPtr(m_ir->getInt64(reinterpret_cast<u64>(ppptr ? ppptr : &spu_recompiler_base::dispatch)), type));
 			m_ir->SetInsertPoint(cblock);
 			return result;
 		}
@ -2652,36 +2792,20 @@ public:
 		}
 	}

-	virtual spu_function_t compile(std::vector<u32>&& func_rv) override
+	virtual bool compile(u64 last_reset_count, const std::vector<u32>& func) override
 	{
-		init();
+		const auto fn_location = m_spurt->find(last_reset_count, func);

-		// Don't lock without shared runtime
-		std::unique_lock lock(m_spurt->m_mutex);
-
-		// Try to find existing function, register new one if necessary
-		const auto fn_info = m_spurt->m_map.emplace(std::move(func_rv), nullptr);
-
-		auto& fn_location = fn_info.first->second;
-
-		if (!fn_location && !fn_info.second)
+		if (fn_location == spu_runtime::g_dispatcher)
 		{
-			// Wait if already in progress
-			while (!fn_location)
-			{
-				m_spurt->m_cond.wait(lock);
-			}
+			return true;
 		}

-		if (fn_location)
+		if (!fn_location)
 		{
-			return fn_location;
+			return false;
 		}

-		auto& func = fn_info.first->first;
-
-		lock.unlock();
-
 		std::string hash;
 		{
 			sha1_context ctx;
@ -2744,12 +2868,7 @@ public:

 			log += '\n';
 			this->dump(log);
-			fs::file(m_spurt->m_cache_path + "spu.log", fs::write + fs::append).write(log);
-		}
-
-		if (m_cache && g_cfg.core.spu_cache)
-		{
-			m_cache->add(func);
+			fs::file(m_spurt->get_cache_path() + "spu.log", fs::write + fs::append).write(log);
 		}

 		using namespace llvm;
@ -3181,7 +3300,7 @@ public:

 			if (g_cfg.core.spu_debug)
 			{
-				fs::file(m_spurt->m_cache_path + "spu.log", fs::write + fs::append).write(log);
+				fs::file(m_spurt->get_cache_path() + "spu.log", fs::write + fs::append).write(log);
 			}

 			fmt::raw_error("Compilation failed");
@ -3190,7 +3309,7 @@ public:
 		if (g_cfg.core.spu_debug)
 		{
 			// Testing only
-			m_jit.add(std::move(module), m_spurt->m_cache_path + "llvm/");
+			m_jit.add(std::move(module), m_spurt->get_cache_path() + "llvm/");
 		}
 		else
 		{
@ -3202,15 +3321,23 @@ public:
 		// Register function pointer
 		const spu_function_t fn = reinterpret_cast<spu_function_t>(m_jit.get_engine().getPointerToFunction(main_func));

-		m_spurt->add(*fn_info.first, fn);
+		if (!m_spurt->add(last_reset_count, fn_location, fn))
+		{
+			return false;
+		}

 		if (g_cfg.core.spu_debug)
 		{
 			out.flush();
-			fs::file(m_spurt->m_cache_path + "spu.log", fs::write + fs::append).write(log);
+			fs::file(m_spurt->get_cache_path() + "spu.log", fs::write + fs::append).write(log);
 		}

-		return fn;
+		if (m_cache && g_cfg.core.spu_cache)
+		{
+			m_cache->add(func);
+		}
+
+		return true;
 	}

 	static bool exec_check_state(spu_thread* _spu)
--- a/rpcs3/Emu/Cell/SPURecompiler.h
+++ b/rpcs3/Emu/Cell/SPURecompiler.h
@ -36,10 +36,13 @@ public:
 // Helper class
 class spu_runtime
 {
-public:
-	shared_mutex m_mutex;
+	mutable shared_mutex m_mutex;

-	cond_variable m_cond;
+	mutable cond_variable m_cond;
+
+	mutable atomic_t<u64> m_passive_locks{0};
+
+	atomic_t<u64> m_reset_count{0};

 	// All functions
 	std::map<std::vector<u32>, spu_function_t> m_map;
@ -57,12 +60,12 @@ public:
 		std::map<std::vector<u32>, spu_function_t>::iterator beg;
 		std::map<std::vector<u32>, spu_function_t>::iterator end;
 	};
-private:
-	// Scratch vector
-	std::vector<work> workload;

 	// Scratch vector
-	std::vector<u32> addrv{u32{0}};
+	static thread_local std::vector<work> workload;
+
+	// Scratch vector
+	static thread_local std::vector<u32> addrv;

 	// Trampoline to spu_recompiler_base::dispatch
 	static const spu_function_t tr_dispatch;
@ -73,20 +76,104 @@ private:
 public:
 	spu_runtime();

+	const std::string& get_cache_path() const // Read-only accessor for m_cache_path
+	{
+		return m_cache_path;
+	}
+
 	// Add compiled function and generate trampoline if necessary
-	void add(std::pair<const std::vector<u32>, spu_function_t>& where, spu_function_t compiled);
+	bool add(u64 last_reset_count, void* where, spu_function_t compiled);
+
+	// Return opaque pointer for add()
+	void* find(u64 last_reset_count, const std::vector<u32>&);

 	// Find existing function
-	spu_function_t find(const se_t<u32, false>* ls, u32 addr);
+	spu_function_t find(const se_t<u32, false>* ls, u32 addr) const;

 	// Generate a patchable trampoline to spu_recompiler_base::branch
 	spu_function_t make_branch_patchpoint(u32 target) const;

+	// Returns the current reset count, the value to pass to reset(); used for race avoidance (can result in double reset)
+	u64 get_reset_count() const
+	{
+		return m_reset_count.load();
+	}
+
+	// Remove all compiled functions and free JIT memory
+	u64 reset(std::size_t last_reset_count);
+
 	// Handle cpu_flag::jit_return
-	void handle_return(cpu_thread* _thr);
+	void handle_return(spu_thread* _spu);

 	// All dispatchers (array allocated in jit memory)
 	static atomic_t<spu_function_t>* const g_dispatcher;
+
+	struct passive_lock // Scoped registration of a runtime user: counted in m_passive_locks, which reset() spins on until it is zero
+	{
+		spu_runtime& _this;
+
+		passive_lock(const passive_lock&) = delete;
+
+		passive_lock(spu_runtime& _this)
+			: _this(_this)
+		{
+			std::lock_guard lock(_this.m_mutex); // Registration is serialized with other m_mutex holders (e.g. a running reset())
+			_this.m_passive_locks++;
+		}
+
+		~passive_lock()
+		{
+			_this.m_passive_locks--; // Unregister; the mutex is not taken here
+		}
+	};
+
+	// Exclusive lock within passive_lock scope (set `notify` to wake m_cond waiters when the lock is released)
+	struct writer_lock
+	{
+		spu_runtime& _this;
+		bool notify = false;
+
+		writer_lock(const writer_lock&) = delete;
+
+		writer_lock(spu_runtime& _this)
+			: _this(_this)
+		{
+			// Temporarily release the passive lock (the decrement happens before blocking on the mutex)
+			_this.m_passive_locks--;
+			_this.m_mutex.lock();
+		}
+
+		~writer_lock()
+		{
+			_this.m_passive_locks++; // Re-register as a passive user before the mutex is released
+			_this.m_mutex.unlock();
+
+			if (notify) // Notification is sent after unlocking
+			{
+				_this.m_cond.notify_all();
+			}
+		}
+	};
+
+	struct reader_lock // Shared (read) lock on m_mutex; drops the passive lock for its lifetime, mirroring writer_lock
+	{
+		const spu_runtime& _this;
+
+		reader_lock(const reader_lock&) = delete;
+
+		reader_lock(const spu_runtime& _this)
+			: _this(_this)
+		{
+			_this.m_passive_locks--; // Decrement happens before blocking on the shared lock
+			_this.m_mutex.lock_shared();
+		}
+
+		~reader_lock()
+		{
+			_this.m_passive_locks++; // Re-register as a passive user before the shared lock is released
+			_this.m_mutex.unlock_shared();
+		}
+	};
 };

 // SPU Recompiler instance base class
@ -130,8 +217,11 @@ public:
 	// Initialize
 	virtual void init() = 0;

-	// Compile function
-	virtual spu_function_t compile(std::vector<u32>&&) = 0;
+	// Compile function (may fail)
+	virtual bool compile(u64 last_reset_count, const std::vector<u32>&) = 0;
+
+	// Compile function, handle failure
+	void make_function(const std::vector<u32>&);

 	// Default dispatch function fallback (second arg is unused)
 	static void dispatch(spu_thread&, void*, u8* rip);
--- a/rpcs3/Emu/Cell/SPUThread.cpp
+++ b/rpcs3/Emu/Cell/SPUThread.cpp
@ -568,8 +568,26 @@ void spu_thread::cpu_task()

 	if (jit)
 	{
-		while (LIKELY(!state || !check_state()))
+		// Register SPU runtime user
+		spu_runtime::passive_lock _passive_lock(jit->get_runtime());
+
+		while (true)
 		{
+			if (UNLIKELY(state))
+			{
+				if (check_state())
+				{
+					if (state & cpu_flag::jit_return)
+					{
+						// Handle jit_return as a special case
+						jit->get_runtime().handle_return(this);
+						continue;
+					}
+
+					break;
+				}
+			}
+
 			spu_runtime::g_dispatcher[pc / 4](*this, vm::_ptr<u8>(offset), nullptr);
 		}