diff --git a/Utilities/JIT.cpp b/Utilities/JIT.cpp index 9cf80dbb0b..419a02b45c 100644 --- a/Utilities/JIT.cpp +++ b/Utilities/JIT.cpp @@ -190,18 +190,34 @@ asmjit::JitRuntime& asmjit::get_global_runtime() return g_rt; } -asmjit::Label asmjit::build_transaction_enter(asmjit::X86Assembler& c, asmjit::Label fallback) +void asmjit::build_transaction_enter(asmjit::X86Assembler& c, asmjit::Label fallback, const asmjit::X86Gp& ctr, uint less_than) { Label fall = c.newLabel(); Label begin = c.newLabel(); c.jmp(begin); c.bind(fall); - c.test(x86::eax, _XABORT_RETRY); - c.jz(fallback); + + if (less_than < 65) + { + c.add(ctr, 1); + c.test(x86::eax, _XABORT_RETRY); + c.jz(fallback); + } + else + { + // Count an attempt without RETRY flag as 65 normal attempts and continue + c.not_(x86::eax); + c.and_(x86::eax, _XABORT_RETRY); + c.shl(x86::eax, 5); + c.add(x86::eax, 1); // eax = RETRY ? 1 : 65 + c.add(ctr, x86::rax); + } + + c.cmp(ctr, less_than); + c.jae(fallback); c.align(kAlignCode, 16); c.bind(begin); c.xbegin(fall); - return begin; } void asmjit::build_transaction_abort(asmjit::X86Assembler& c, unsigned char code) diff --git a/Utilities/JIT.h b/Utilities/JIT.h index d3028ce47e..ac658c7cbb 100644 --- a/Utilities/JIT.h +++ b/Utilities/JIT.h @@ -43,7 +43,7 @@ namespace asmjit asmjit::JitRuntime& get_global_runtime(); // Emit xbegin and adjacent loop, return label at xbegin - Label build_transaction_enter(X86Assembler& c, Label fallback); + void build_transaction_enter(X86Assembler& c, Label fallback, const X86Gp& ctr, uint less_than); // Emit xabort void build_transaction_abort(X86Assembler& c, unsigned char code); diff --git a/Utilities/Thread.cpp b/Utilities/Thread.cpp index fe29ef58dc..b07bbe2323 100644 --- a/Utilities/Thread.cpp +++ b/Utilities/Thread.cpp @@ -3,6 +3,7 @@ #include "Emu/System.h" #include "Emu/IdManager.h" #include "Emu/Cell/SPUThread.h" +#include "Emu/Cell/PPUThread.h" #include "Emu/Cell/RawSPUThread.h" #include "Emu/Cell/lv2/sys_mmapper.h" #include "Emu/Cell/lv2/sys_event.h" @@ -1101,6 +1102,11 @@ bool handle_access_violation(u32 addr, bool is_writing, x64_context* context) try { + if (cpu) + { + vm::temporary_unlock(*cpu); + } + handled = rsx::g_access_violation_handler(addr, is_writing); } catch (const std::exception& e) @@ -1109,7 +1115,6 @@ bool handle_access_violation(u32 addr, bool is_writing, x64_context* context) if (cpu) { - vm::temporary_unlock(*cpu); cpu->state += cpu_flag::dbg_pause; if (cpu->test_stopped()) @@ -1131,6 +1136,10 @@ bool handle_access_violation(u32 addr, bool is_writing, x64_context* context) return true; } + + if (cpu && cpu->test_stopped()) + { + } } auto code = (const u8*)RIP(context); diff --git a/Utilities/cond.cpp b/Utilities/cond.cpp index 718ce5cb30..8a829cd282 100644 --- a/Utilities/cond.cpp +++ b/Utilities/cond.cpp @@ -273,6 +273,177 @@ void shared_cond::imp_notify() noexcept balanced_awaken(m_cvx32, utils::popcnt32(wait_mask)); } +void shared_cond::wait_all() noexcept +{ + // Try to acquire waiting state without locking but only if there are other locks + const auto [old_, result] = m_cvx32.fetch_op([](u64& cvx32) -> u64 + { + // Check waiting alone + if ((cvx32 & 0xffffffff) == 0) + { + return 0; + } + + // Combine used bits and invert to find least significant bit unused + const u32 slot = utils::cnttz64(~((cvx32 & 0xffffffff) | (cvx32 >> 32)), true); + + // Set waiting bit (does nothing if all slots are used) + cvx32 |= (1ull << slot) & 0xffffffff; + return 1ull << slot; + }); + + if (!result) + { + return; + } + + if (result > 
0xffffffffu) + { + // All slots are used, fallback to spin wait + while (m_cvx32 & 0xffffffff) + { + busy_wait(); + } + + return; + } + + const u64 wait_bit = result; + const u64 lock_bit = wait_bit | (wait_bit << 32); + + balanced_wait_until(m_cvx32, -1, [&](u64& cvx32, auto... ret) -> int + { + if ((cvx32 & wait_bit) == 0) + { + // Remove signal and unlock at once + cvx32 &= ~lock_bit; + return +1; + } + + if constexpr (sizeof...(ret)) + { + cvx32 &= ~lock_bit; + return -1; + } + + return 0; + }); +} + +bool shared_cond::wait_all(shared_cond::shared_lock& lock) noexcept +{ + AUDIT(lock.m_this == this); + + if (lock.m_slot >= 32) + { + // Invalid argument, assume notified + return true; + } + + const u64 wait_bit = c_wait << lock.m_slot; + const u64 lock_bit = c_lock << lock.m_slot; + + // Try to acquire waiting state only if there are other locks + const auto [old_, not_alone] = m_cvx32.fetch_op([&](u64& cvx32) + { + // Check locking alone + if (((cvx32 >> 32) & cvx32) == (lock_bit >> 32)) + { + return false; + } + + // c_lock -> c_wait, c_sig -> unlock + cvx32 &= ~(lock_bit & ~wait_bit); + return true; + }); + + if (!not_alone) + { + return false; + } + else + { + // Set invalid slot to acknowledge unlocking + lock.m_slot = 33; + } + + if ((old_ & wait_bit) == 0) + { + // Already signaled, return without waiting + return true; + } + + balanced_wait_until(m_cvx32, -1, [&](u64& cvx32, auto... ret) -> int + { + if ((cvx32 & wait_bit) == 0) + { + // Remove signal and unlock at once + cvx32 &= ~lock_bit; + return +1; + } + + if constexpr (sizeof...(ret)) + { + cvx32 &= ~lock_bit; + return -1; + } + + return 0; + }); + + return true; +} + +bool shared_cond::notify_all(shared_cond::shared_lock& lock) noexcept +{ + AUDIT(lock.m_this == this); + + if (lock.m_slot >= 32) + { + // Invalid argument + return false; + } + + const u64 slot_mask = c_sig << lock.m_slot; + + auto [old, ok] = m_cvx32.fetch_op([&](u64& cvx32) + { + if (((cvx32 << 32) & cvx32) != slot_mask) + { + return false; + } + + if (const u64 sig_mask = cvx32 & 0xffffffff) + { + cvx32 &= (0xffffffffull << 32) & ~slot_mask; + cvx32 |= (sig_mask << 32) & ~slot_mask; + return true; + } + + return false; + }); + + if (!ok) + { + // Not an exclusive reader + return false; + } + + // Set invalid slot to acknowledge unlocking + lock.m_slot = 34; + + // Determine if some waiters need a syscall notification + const u64 wait_mask = old & (~old >> 32); + + if (UNLIKELY(!wait_mask)) + { + return true; + } + + balanced_awaken(m_cvx32, utils::popcnt32(wait_mask)); + return true; +} + bool lf_queue_base::wait(u64 _timeout) { auto _old = m_head.compare_and_swap(0, 1); diff --git a/Utilities/cond.h b/Utilities/cond.h index dc716fab88..d76bf81d8e 100644 --- a/Utilities/cond.h +++ b/Utilities/cond.h @@ -206,7 +206,7 @@ class shared_cond m_slot = m_this->m_cvx32.atomic_op([](u64& cvx32) { // Combine used bits and invert to find least significant bit unused - const u32 slot = utils::cnttz32(~((cvx32 & 0xffffffff) | (cvx32 >> 32)), true); + const u32 slot = utils::cnttz64(~((cvx32 & 0xffffffff) | (cvx32 >> 32)), true); // Set lock bits (does nothing if all slots are used) const u64 bit = (1ull << slot) & 0xffffffff; @@ -217,6 +217,13 @@ class shared_cond shared_lock(const shared_lock&) = delete; + shared_lock(shared_lock&& rhs) + : m_this(rhs.m_this) + , m_slot(rhs.m_slot) + { + rhs.m_slot = 32; + } + shared_lock& operator=(const shared_lock&) = delete; ~shared_lock() @@ -261,6 +268,10 @@ public: return imp_wait(lock.m_slot, usec_timeout); } + void 
wait_all() noexcept; + + bool wait_all(shared_lock& lock) noexcept; + void notify_all() noexcept { if (LIKELY(!m_cvx32)) @@ -268,4 +279,6 @@ public: imp_notify(); } + + bool notify_all(shared_lock& lock) noexcept; }; diff --git a/llvm b/llvm index b860b5e8f4..99b5284463 160000 --- a/llvm +++ b/llvm @@ -1 +1 @@ -Subproject commit b860b5e8f4ee90d6eb567d83ce8ed1a3e71e496f +Subproject commit 99b5284463025849c59067e79a3c08899049757e diff --git a/rpcs3/Emu/CPU/CPUThread.cpp b/rpcs3/Emu/CPU/CPUThread.cpp index 150a197f78..75130fe545 100644 --- a/rpcs3/Emu/CPU/CPUThread.cpp +++ b/rpcs3/Emu/CPU/CPUThread.cpp @@ -19,10 +19,13 @@ void fmt_class_string::format(std::string& out, u64 arg) { case cpu_flag::stop: return "STOP"; case cpu_flag::exit: return "EXIT"; + case cpu_flag::wait: return "w"; + case cpu_flag::pause: return "p"; case cpu_flag::suspend: return "s"; case cpu_flag::ret: return "ret"; case cpu_flag::signal: return "sig"; case cpu_flag::memory: return "mem"; + case cpu_flag::jit_return: return "JIT"; case cpu_flag::dbg_global_pause: return "G-PAUSE"; case cpu_flag::dbg_global_stop: return "G-EXIT"; case cpu_flag::dbg_pause: return "PAUSE"; @@ -42,10 +45,43 @@ void fmt_class_string>::format(std::string& out, u64 arg) thread_local cpu_thread* g_tls_current_cpu_thread = nullptr; +// For coordination and notification +alignas(64) shared_cond g_cpu_array_lock; + +// For cpu_flag::pause bit setting/removing +alignas(64) shared_mutex g_cpu_pause_lock; + +// For cpu_flag::pause +alignas(64) atomic_t g_cpu_pause_ctr{0}; + +// Semaphore for global thread array (global counter) +alignas(64) atomic_t g_cpu_array_sema{0}; + +// Semaphore subdivision for each array slot (64 x N in total) +atomic_t g_cpu_array_bits[6]{}; + +// All registered threads +atomic_t g_cpu_array[sizeof(g_cpu_array_bits) * 8]{}; + +template +void for_all_cpu(F&& func) noexcept +{ + for (u32 i = 0; i < ::size32(g_cpu_array_bits); i++) + { + for (u64 bits = g_cpu_array_bits[i]; bits; bits &= bits - 1) + { + const u64 index = i * 64 + utils::cnttz64(bits, true); + + if (cpu_thread* cpu = g_cpu_array[index].load()) + { + func(cpu); + } + } + } +} + void cpu_thread::operator()() { - state -= cpu_flag::exit; - g_tls_current_cpu_thread = this; if (g_cfg.core.thread_scheduler_enabled) @@ -58,6 +94,48 @@ void cpu_thread::operator()() thread_ctrl::set_native_priority(-1); } + // Register thread in g_cpu_array + if (!g_cpu_array_sema.try_inc(sizeof(g_cpu_array_bits) * 8)) + { + LOG_FATAL(GENERAL, "Too many threads"); + Emu.Pause(); + return; + } + + u64 array_slot = -1; + + for (u32 i = 0;; i = (i + 1) % ::size32(g_cpu_array_bits)) + { + if (LIKELY(~g_cpu_array_bits[i])) + { + const u64 found = g_cpu_array_bits[i].atomic_op([](u64& bits) -> u64 + { + // Find empty array slot and set its bit + if (LIKELY(~bits)) + { + const u64 bit = utils::cnttz64(~bits, true); + bits |= 1ull << bit; + return bit; + } + + return 64; + }); + + if (LIKELY(found < 64)) + { + // Fixup + array_slot = i * 64 + found; + break; + } + } + } + + // Register and wait if necessary + verify("g_cpu_array[...] -> this" HERE), g_cpu_array[array_slot].exchange(this) == nullptr; + + state += cpu_flag::wait; + g_cpu_array_lock.wait_all(); + // Check thread status while (!(state & (cpu_flag::exit + cpu_flag::dbg_global_stop))) { @@ -86,6 +164,13 @@ void cpu_thread::operator()() thread_ctrl::wait(); } + + // Unregister and wait if necessary + state += cpu_flag::wait; + verify("g_cpu_array[...] 
-> null" HERE), g_cpu_array[array_slot].exchange(nullptr) == this; + g_cpu_array_bits[array_slot / 64] &= ~(1ull << (array_slot % 64)); + g_cpu_array_sema--; + g_cpu_array_lock.wait_all(); } void cpu_thread::on_abort() @@ -105,7 +190,7 @@ cpu_thread::cpu_thread(u32 id) g_threads_created++; } -bool cpu_thread::check_state() +bool cpu_thread::check_state() noexcept { #ifdef WITH_GDB_DEBUGGER if (state & cpu_flag::dbg_pause) @@ -117,6 +202,11 @@ bool cpu_thread::check_state() bool cpu_sleep_called = false; bool cpu_flag_memory = false; + if (!(state & cpu_flag::wait)) + { + state += cpu_flag::wait; + } + while (true) { if (state & cpu_flag::memory) @@ -131,8 +221,9 @@ bool cpu_thread::check_state() state -= cpu_flag::memory; } - if (state & cpu_flag::exit + cpu_flag::jit_return + cpu_flag::dbg_global_stop) + if (state & (cpu_flag::exit + cpu_flag::jit_return + cpu_flag::dbg_global_stop)) { + state += cpu_flag::wait; return true; } @@ -141,7 +232,24 @@ bool cpu_thread::check_state() cpu_sleep_called = false; } - if (!is_paused()) + const auto [state0, escape] = state.fetch_op([&](bs_t& flags) + { + // Check pause flags which hold thread inside check_state + if (flags & (cpu_flag::pause + cpu_flag::suspend + cpu_flag::dbg_global_pause + cpu_flag::dbg_pause)) + { + return false; + } + + // Atomically clean wait flag and escape + if (!(flags & (cpu_flag::exit + cpu_flag::jit_return + cpu_flag::dbg_global_stop + cpu_flag::ret + cpu_flag::stop))) + { + flags -= cpu_flag::wait; + } + + return true; + }); + + if (escape) { if (cpu_flag_memory) { @@ -150,14 +258,43 @@ bool cpu_thread::check_state() break; } - else if (!cpu_sleep_called && state & cpu_flag::suspend) + else if (!cpu_sleep_called && state0 & cpu_flag::suspend) { cpu_sleep(); cpu_sleep_called = true; continue; } - thread_ctrl::wait(); + if (state & cpu_flag::wait) + { + // Spin wait once for a bit before resorting to thread_ctrl::wait + for (u32 i = 0; i < 10; i++) + { + if (state0 & (cpu_flag::pause + cpu_flag::suspend)) + { + busy_wait(500); + } + else + { + break; + } + } + + if (!(state0 & (cpu_flag::pause + cpu_flag::suspend))) + { + continue; + } + } + + if (state0 & (cpu_flag::suspend + cpu_flag::dbg_global_pause + cpu_flag::dbg_pause)) + { + thread_ctrl::wait(); + } + else + { + // If only cpu_flag::pause was set, notification won't arrive + g_cpu_array_lock.wait_all(); + } } const auto state_ = state.load(); @@ -196,3 +333,90 @@ std::string cpu_thread::dump() const { return fmt::format("Type: %s\n" "State: %s\n", typeid(*this).name(), state.load()); } + +cpu_thread::suspend_all::suspend_all(cpu_thread* _this) noexcept + : m_lock(g_cpu_array_lock.try_shared_lock()) + , m_this(_this) +{ + // TODO + if (!m_lock) + { + LOG_FATAL(GENERAL, "g_cpu_array_lock: too many concurrent accesses"); + Emu.Pause(); + return; + } + + if (m_this) + { + m_this->state += cpu_flag::wait; + } + + g_cpu_pause_ctr++; + + reader_lock lock(g_cpu_pause_lock); + + for_all_cpu([](cpu_thread* cpu) + { + cpu->state += cpu_flag::pause; + }); + + busy_wait(500); + + while (true) + { + bool ok = true; + + for_all_cpu([&](cpu_thread* cpu) + { + if (!(cpu->state & cpu_flag::wait)) + { + ok = false; + } + }); + + if (LIKELY(ok)) + { + break; + } + + busy_wait(500); + } +} + +cpu_thread::suspend_all::~suspend_all() +{ + // Make sure the latest thread does the cleanup and notifies others + u64 pause_ctr = 0; + + while ((pause_ctr = g_cpu_pause_ctr), !g_cpu_array_lock.wait_all(m_lock)) + { + if (pause_ctr) + { + std::lock_guard lock(g_cpu_pause_lock); + + // Detect 
possible unfortunate reordering of flag clearing after suspend_all's reader lock + if (g_cpu_pause_ctr != pause_ctr) + { + continue; + } + + for_all_cpu([&](cpu_thread* cpu) + { + if (g_cpu_pause_ctr == pause_ctr) + { + cpu->state -= cpu_flag::pause; + } + }); + } + + if (g_cpu_array_lock.notify_all(m_lock)) + { + break; + } + } + + if (m_this) + { + m_this->check_state(); + } +} diff --git a/rpcs3/Emu/CPU/CPUThread.h b/rpcs3/Emu/CPU/CPUThread.h index 7eb3fdf633..a1f3af46e9 100644 --- a/rpcs3/Emu/CPU/CPUThread.h +++ b/rpcs3/Emu/CPU/CPUThread.h @@ -2,12 +2,15 @@ #include "../Utilities/Thread.h" #include "../Utilities/bit_set.h" +#include "../Utilities/cond.h" // Thread state flags enum class cpu_flag : u32 { stop, // Thread not running (HLE, initial state) exit, // Irreversible exit + wait, // Indicates waiting state, set by the thread itself + pause, // Thread suspended by suspend_all technique suspend, // Thread suspended ret, // Callback return requested signal, // Thread received a signal (HLE) @@ -39,15 +42,15 @@ public: const u32 id; // Public thread state - atomic_bs_t state{+cpu_flag::stop}; + atomic_bs_t state{cpu_flag::stop + cpu_flag::wait}; // Process thread state, return true if the checker must return - bool check_state(); + bool check_state() noexcept; // Process thread state (pause) [[nodiscard]] bool test_stopped() { - if (UNLIKELY(state)) + if (state) { if (check_state()) { @@ -99,6 +102,20 @@ public: // Callback for vm::temporary_unlock virtual void cpu_unmem() {} + + // Thread locker + class suspend_all + { + decltype(std::declval().try_shared_lock()) m_lock; + + cpu_thread* m_this; + + public: + suspend_all(cpu_thread* _this) noexcept; + suspend_all(const suspend_all&) = delete; + suspend_all& operator=(const suspend_all&) = delete; + ~suspend_all(); + }; }; inline cpu_thread* get_current_cpu_thread() noexcept diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp index 48c455f822..cd8a9b7d38 100644 --- a/rpcs3/Emu/Cell/PPUThread.cpp +++ b/rpcs3/Emu/Cell/PPUThread.cpp @@ -1064,11 +1064,12 @@ const auto ppu_stwcx_tx = build_function_asmtest_stopped()) + { + _spu->pc += 4; + spu_runtime::g_escape(_spu); + } } void spu_recompiler::STOP(spu_opcode_t op) @@ -1407,7 +1413,7 @@ void spu_recompiler::MFSPR(spu_opcode_t op) c->movdqa(SPU_OFF_128(gpr, op.rt), vr); } -static s64 spu_rdch(spu_thread* _spu, u32 ch) +static u32 spu_rdch(spu_thread* _spu, u32 ch) { const s64 result = _spu->get_ch_value(ch); @@ -1416,7 +1422,13 @@ static s64 spu_rdch(spu_thread* _spu, u32 ch) spu_runtime::g_escape(_spu); } - return result; + if (_spu->test_stopped()) + { + _spu->pc += 4; + spu_runtime::g_escape(_spu); + } + + return static_cast(result & 0xffffffff); } void spu_recompiler::RDCH(spu_opcode_t op) @@ -2319,14 +2331,26 @@ static void spu_wrch(spu_thread* _spu, u32 ch, u32 value) { spu_runtime::g_escape(_spu); } + + if (_spu->test_stopped()) + { + _spu->pc += 4; + spu_runtime::g_escape(_spu); + } } -static void spu_wrch_mfc(spu_thread* _spu, spu_function_t _ret) +static void spu_wrch_mfc(spu_thread* _spu) { if (!_spu->process_mfc_cmd()) { spu_runtime::g_escape(_spu); } + + if (_spu->test_stopped()) + { + _spu->pc += 4; + spu_runtime::g_escape(_spu); + } } void spu_recompiler::WRCH(spu_opcode_t op) diff --git a/rpcs3/Emu/Cell/SPUInterpreter.cpp b/rpcs3/Emu/Cell/SPUInterpreter.cpp index cd3cfa6301..6c90bb449f 100644 --- a/rpcs3/Emu/Cell/SPUInterpreter.cpp +++ b/rpcs3/Emu/Cell/SPUInterpreter.cpp @@ -167,6 +167,13 @@ bool spu_interpreter::RDCH(spu_thread& spu, spu_opcode_t op) } 
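The pause/wait handshake that check_state() and suspend_all implement above reduces to a small protocol: every thread raises cpu_flag::wait at each safe point, and suspend_all raises cpu_flag::pause on all registered threads, then waits until every wait bit is visible before touching shared state. A self-contained sketch of that protocol, using std::atomic stand-ins for bs_t<cpu_flag> (toy_thread, WAIT and PAUSE are illustrative names, not rpcs3 API):

#include <atomic>
#include <thread>
#include <vector>

enum : unsigned { WAIT = 1, PAUSE = 2 }; // stand-ins for cpu_flag::wait/pause

struct toy_thread
{
	std::atomic<unsigned> state{0};

	// Analogue of check_state(): park while paused, then atomically drop
	// WAIT only while no pause request is pending (mirrors the fetch_op
	// added to check_state above)
	void checkpoint()
	{
		state.fetch_or(WAIT);

		for (unsigned s = state.load();;)
		{
			if (s & PAUSE)
			{
				std::this_thread::yield(); // the real code parks on shared_cond
				s = state.load();
				continue;
			}

			if (state.compare_exchange_weak(s, s & ~WAIT))
			{
				return;
			}
		}
	}
};

// Analogue of cpu_thread::suspend_all: request a pause, then wait until
// every thread is provably sitting inside checkpoint()
void suspend_all_sketch(const std::vector<toy_thread*>& threads)
{
	for (auto* t : threads)
		t->state.fetch_or(PAUSE);

	for (auto* t : threads)
		while (!(t->state.load() & WAIT))
			std::this_thread::yield();

	// Exclusive section: no toy_thread executes between checkpoints here,
	// because WAIT cannot be cleared while PAUSE is set

	for (auto* t : threads)
		t->state.fetch_and(~PAUSE);
}

The real implementation additionally registers threads in g_cpu_array so suspend_all can enumerate them, and sleeps on g_cpu_array_lock (shared_cond) instead of spinning.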
spu.gpr[op.rt] = v128::from32r(static_cast(result)); + + if (spu.state) + { + spu.pc += 4; + return false; + } + return true; } @@ -414,7 +421,18 @@ bool spu_interpreter::MTSPR(spu_thread& spu, spu_opcode_t op) bool spu_interpreter::WRCH(spu_thread& spu, spu_opcode_t op) { - return spu.set_ch_value(op.ra, spu.gpr[op.rt]._u32[3]); + if (!spu.set_ch_value(op.ra, spu.gpr[op.rt]._u32[3])) + { + return false; + } + + if (spu.state) + { + spu.pc += 4; + return false; + } + + return true; } bool spu_interpreter::BIZ(spu_thread& spu, spu_opcode_t op) diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp index ff6dc304e4..41b8267d6c 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.cpp +++ b/rpcs3/Emu/Cell/SPURecompiler.cpp @@ -5125,34 +5125,30 @@ public: call("spu_unknown", &exec_unk, m_thread, m_ir->getInt32(op_unk.opcode)); } - static bool exec_stop(spu_thread* _spu, u32 code) + static void exec_stop(spu_thread* _spu, u32 code) { - return _spu->stop_and_signal(code); + if (!_spu->stop_and_signal(code)) + { + spu_runtime::g_escape(_spu); + } + + if (_spu->test_stopped()) + { + _spu->pc += 4; + spu_runtime::g_escape(_spu); + } } void STOP(spu_opcode_t op) // { if (m_interp_magn) { - const auto succ = call("spu_syscall", &exec_stop, m_thread, m_ir->CreateAnd(m_interp_op, m_ir->getInt32(0x3fff))); - const auto next = llvm::BasicBlock::Create(m_context, "", m_function); - const auto stop = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(succ, next, stop); - m_ir->SetInsertPoint(stop); - m_ir->CreateRetVoid(); - m_ir->SetInsertPoint(next); + call("spu_syscall", &exec_stop, m_thread, m_ir->CreateAnd(m_interp_op, m_ir->getInt32(0x3fff))); return; } update_pc(); - const auto succ = call("spu_syscall", &exec_stop, m_thread, m_ir->getInt32(op.opcode & 0x3fff)); - const auto next = llvm::BasicBlock::Create(m_context, "", m_function); - const auto stop = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(succ, next, stop); - m_ir->SetInsertPoint(stop); - m_ir->CreateStore(m_ir->getFalse(), m_fake_global1, true); - m_ir->CreateBr(next); - m_ir->SetInsertPoint(next); + call("spu_syscall", &exec_stop, m_thread, m_ir->getInt32(op.opcode & 0x3fff)); if (g_cfg.core.spu_block_size == spu_block_size_type::safe) { @@ -5167,28 +5163,35 @@ public: { if (m_interp_magn) { - const auto succ = call("spu_syscall", &exec_stop, m_thread, m_ir->getInt32(0x3fff)); - const auto next = llvm::BasicBlock::Create(m_context, "", m_function); - const auto stop = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(succ, next, stop); - m_ir->SetInsertPoint(stop); - m_ir->CreateRetVoid(); - m_ir->SetInsertPoint(next); + call("spu_syscall", &exec_stop, m_thread, m_ir->getInt32(0x3fff)); return; } STOP(spu_opcode_t{0x3fff}); } - static s64 exec_rdch(spu_thread* _spu, u32 ch) + static u32 exec_rdch(spu_thread* _spu, u32 ch) { - return _spu->get_ch_value(ch); + const s64 result = _spu->get_ch_value(ch); + + if (result < 0) + { + spu_runtime::g_escape(_spu); + } + + if (_spu->test_stopped()) + { + _spu->pc += 4; + spu_runtime::g_escape(_spu); + } + + return static_cast(result & 0xffffffff); } - static s64 exec_read_in_mbox(spu_thread* _spu) + static u32 exec_read_in_mbox(spu_thread* _spu) { // TODO - return _spu->get_ch_value(SPU_RdInMbox); + return exec_rdch(_spu, SPU_RdInMbox); } static u32 exec_read_dec(spu_thread* _spu) @@ -5203,7 +5206,7 @@ public: return res; } - static s64 exec_read_events(spu_thread* _spu) + static u32 
exec_read_events(spu_thread* _spu) { if (const u32 events = _spu->get_events()) { @@ -5211,7 +5214,7 @@ public: } // TODO - return _spu->get_ch_value(SPU_RdEventStat); + return exec_rdch(_spu, SPU_RdEventStat); } llvm::Value* get_rdch(spu_opcode_t op, u32 off, bool atomic) @@ -5234,20 +5237,17 @@ public: const auto _cur = m_ir->GetInsertBlock(); const auto done = llvm::BasicBlock::Create(m_context, "", m_function); const auto wait = llvm::BasicBlock::Create(m_context, "", m_function); - const auto stop = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(m_ir->CreateICmpSLT(val0, m_ir->getInt64(0)), done, wait); + const auto cond = m_ir->CreateICmpSLT(val0, m_ir->getInt64(0)); + val0 = m_ir->CreateTrunc(val0, get_type()); + m_ir->CreateCondBr(cond, done, wait); m_ir->SetInsertPoint(wait); const auto val1 = call("spu_read_channel", &exec_rdch, m_thread, m_ir->getInt32(op.ra)); - m_ir->CreateCondBr(m_ir->CreateICmpSLT(val1, m_ir->getInt64(0)), stop, done); - m_ir->SetInsertPoint(stop); - m_ir->CreateStore(m_ir->getFalse(), m_fake_global1, true); m_ir->CreateBr(done); m_ir->SetInsertPoint(done); - const auto rval = m_ir->CreatePHI(get_type(), 2); + const auto rval = m_ir->CreatePHI(get_type(), 2); rval->addIncoming(val0, _cur); rval->addIncoming(val1, wait); - rval->addIncoming(m_ir->getInt64(0), stop); - return m_ir->CreateTrunc(rval, get_type()); + return rval; } void RDCH(spu_opcode_t op) // @@ -5257,13 +5257,6 @@ public: if (m_interp_magn) { res.value = call("spu_read_channel", &exec_rdch, m_thread, get_imm(op.ra).value); - const auto next = llvm::BasicBlock::Create(m_context, "", m_function); - const auto stop = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(m_ir->CreateICmpSLT(res.value, m_ir->getInt64(0)), stop, next); - m_ir->SetInsertPoint(stop); - m_ir->CreateRetVoid(); - m_ir->SetInsertPoint(next); - res.value = m_ir->CreateTrunc(res.value, get_type()); set_vr(op.rt, insert(splat(0), 3, res)); return; } @@ -5279,14 +5272,6 @@ public: { update_pc(); res.value = call("spu_read_in_mbox", &exec_read_in_mbox, m_thread); - const auto next = llvm::BasicBlock::Create(m_context, "", m_function); - const auto stop = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(m_ir->CreateICmpSLT(res.value, m_ir->getInt64(0)), stop, next); - m_ir->SetInsertPoint(stop); - m_ir->CreateStore(m_ir->getFalse(), m_fake_global1, true); - m_ir->CreateBr(next); - m_ir->SetInsertPoint(next); - res.value = m_ir->CreateTrunc(res.value, get_type()); break; } case MFC_RdTagStat: @@ -5333,14 +5318,6 @@ public: { update_pc(); res.value = call("spu_read_events", &exec_read_events, m_thread); - const auto next = llvm::BasicBlock::Create(m_context, "", m_function); - const auto stop = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(m_ir->CreateICmpSLT(res.value, m_ir->getInt64(0)), stop, next); - m_ir->SetInsertPoint(stop); - m_ir->CreateStore(m_ir->getFalse(), m_fake_global1, true); - m_ir->CreateBr(next); - m_ir->SetInsertPoint(next); - res.value = m_ir->CreateTrunc(res.value, get_type()); break; } case SPU_RdMachStat: @@ -5353,14 +5330,6 @@ public: { update_pc(); res.value = call("spu_read_channel", &exec_rdch, m_thread, m_ir->getInt32(op.ra)); - const auto next = llvm::BasicBlock::Create(m_context, "", m_function); - const auto stop = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(m_ir->CreateICmpSLT(res.value, m_ir->getInt64(0)), stop, next); - m_ir->SetInsertPoint(stop); - 
m_ir->CreateStore(m_ir->getFalse(), m_fake_global1, true); - m_ir->CreateBr(next); - m_ir->SetInsertPoint(next); - res.value = m_ir->CreateTrunc(res.value, get_type()); break; } } @@ -5471,14 +5440,18 @@ public: set_vr(op.rt, insert(splat(0), 3, res)); } - static bool exec_wrch(spu_thread* _spu, u32 ch, u32 value) + static void exec_wrch(spu_thread* _spu, u32 ch, u32 value) { - return _spu->set_ch_value(ch, value); - } + if (!_spu->set_ch_value(ch, value)) + { + spu_runtime::g_escape(_spu); + } - static void exec_mfc(spu_thread* _spu) - { - return _spu->do_mfc(); + if (_spu->test_stopped()) + { + _spu->pc += 4; + spu_runtime::g_escape(_spu); + } } static void exec_list_unstall(spu_thread* _spu, u32 tag) @@ -5491,12 +5464,21 @@ public: } } - return exec_mfc(_spu); + _spu->do_mfc(); } - static bool exec_mfc_cmd(spu_thread* _spu) + static void exec_mfc_cmd(spu_thread* _spu) { - return _spu->process_mfc_cmd(); + if (!_spu->process_mfc_cmd()) + { + spu_runtime::g_escape(_spu); + } + + if (_spu->test_stopped()) + { + _spu->pc += 4; + spu_runtime::g_escape(_spu); + } } void WRCH(spu_opcode_t op) // @@ -5505,13 +5487,7 @@ public: if (m_interp_magn) { - const auto succ = call("spu_write_channel", &exec_wrch, m_thread, get_imm(op.ra).value, val.value); - const auto next = llvm::BasicBlock::Create(m_context, "", m_function); - const auto stop = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(succ, next, stop); - m_ir->SetInsertPoint(stop); - m_ir->CreateRetVoid(); - m_ir->SetInsertPoint(next); + call("spu_write_channel", &exec_wrch, m_thread, get_imm(op.ra).value, val.value); return; } @@ -5922,14 +5898,7 @@ public: } update_pc(); - const auto succ = call("spu_write_channel", &exec_wrch, m_thread, m_ir->getInt32(op.ra), val.value); - const auto next = llvm::BasicBlock::Create(m_context, "", m_function); - const auto stop = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(succ, next, stop); - m_ir->SetInsertPoint(stop); - m_ir->CreateStore(m_ir->getFalse(), m_fake_global1, true); - m_ir->CreateBr(next); - m_ir->SetInsertPoint(next); + call("spu_write_channel", &exec_wrch, m_thread, m_ir->getInt32(op.ra), val.value); } void LNOP(spu_opcode_t op) // diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp index 2fbc731501..df632ed807 100644 --- a/rpcs3/Emu/Cell/SPUThread.cpp +++ b/rpcs3/Emu/Cell/SPUThread.cpp @@ -29,36 +29,39 @@ static const bool s_tsx_avx = utils::has_avx(); // For special case static const bool s_tsx_haswell = utils::has_rtm() && !utils::has_mpx(); -#ifdef _MSC_VER -bool operator ==(const u128& lhs, const u128& rhs) +static FORCE_INLINE bool cmp_rdata(const decltype(spu_thread::rdata)& lhs, const decltype(spu_thread::rdata)& rhs) { - return lhs.lo == rhs.lo && lhs.hi == rhs.hi; + const v128 a = (lhs[0] ^ rhs[0]) | (lhs[1] ^ rhs[1]); + const v128 b = (lhs[2] ^ rhs[2]) | (lhs[3] ^ rhs[3]); + const v128 c = (lhs[4] ^ rhs[4]) | (lhs[5] ^ rhs[5]); + const v128 d = (lhs[6] ^ rhs[6]) | (lhs[7] ^ rhs[7]); + const v128 r = (a | b) | (c | d); + return !(r._u64[0] | r._u64[1]); } -#endif -static FORCE_INLINE void mov_rdata(u128* const dst, const u128* const src) +static FORCE_INLINE void mov_rdata(decltype(spu_thread::rdata)& dst, const decltype(spu_thread::rdata)& src) { { - const u128 data0 = src[0]; - const u128 data1 = src[1]; - const u128 data2 = src[2]; + const v128 data0 = src[0]; + const v128 data1 = src[1]; + const v128 data2 = src[2]; dst[0] = data0; dst[1] = data1; dst[2] = data2; } { - const u128 data0 = src[3]; - 
const u128 data1 = src[4]; - const u128 data2 = src[5]; + const v128 data0 = src[3]; + const v128 data1 = src[4]; + const v128 data2 = src[5]; dst[3] = data0; dst[4] = data1; dst[5] = data2; } { - const u128 data0 = src[6]; - const u128 data1 = src[7]; + const v128 data0 = src[6]; + const v128 data1 = src[7]; dst[6] = data0; dst[7] = data1; } @@ -182,13 +185,15 @@ namespace spu } } -const auto spu_putllc_tx = build_function_asm([](asmjit::X86Assembler& c, auto& args) +const auto spu_putllc_tx = build_function_asm([](asmjit::X86Assembler& c, auto& args) { using namespace asmjit; Label fall = c.newLabel(); Label fail = c.newLabel(); Label _ret = c.newLabel(); + Label skip = c.newLabel(); + Label next = c.newLabel(); if (utils::has_avx() && !s_tsx_avx) { @@ -197,8 +202,6 @@ const auto spu_putllc_tx = build_function_asm(cpu_flag::wait)); + + // Touch memory if transaction failed without RETRY flag on the first attempt + c.cmp(x86::r12, 1); + c.jne(next); + c.xor_(x86::rbp, 0xf80); + c.lock().add(x86::dword_ptr(x86::rbp), 0); + c.xor_(x86::rbp, 0xf80); Label fall2 = c.newLabel(); - Label next2 = c.newLabel(); + Label fail2 = c.newLabel(); // Lightened transaction: only compare and swap data - Label retry = build_transaction_enter(c, fall2); + c.bind(next); + build_transaction_enter(c, fall2, x86::r12, 666); if (s_tsx_avx) { @@ -379,7 +393,7 @@ const auto spu_putllc_tx = build_function_asm([](asmjit::X86Assembler& c, auto& args) +const auto spu_getll_tx = build_function_asm([](asmjit::X86Assembler& c, auto& args) { using namespace asmjit; @@ -558,10 +508,9 @@ const auto spu_getll_tx = build_function_asm([](asmjit::X86Assembler& c, auto& args) +const auto spu_getll_inexact = build_function_asm([](asmjit::X86Assembler& c, auto& args) { using namespace asmjit; @@ -691,7 +620,6 @@ const auto spu_getll_fast = build_function_asm([](asmjit::X86Assembler& c, auto& args) +const auto spu_putlluc_tx = build_function_asm([](asmjit::X86Assembler& c, auto& args) { using namespace asmjit; Label fall = c.newLabel(); Label _ret = c.newLabel(); + Label skip = c.newLabel(); + Label next = c.newLabel(); if (utils::has_avx() && !s_tsx_avx) { @@ -884,7 +809,9 @@ const auto spu_putlluc_tx = build_function_asm(cpu_flag::wait)); + + // Touch memory if transaction failed without RETRY flag on the first attempt + c.cmp(x86::r12, 1); + c.jne(next); + c.xor_(x86::rbp, 0xf80); + c.lock().add(x86::dword_ptr(x86::rbp), 0); + c.xor_(x86::rbp, 0xf80); Label fall2 = c.newLabel(); // Lightened transaction - Label retry = build_transaction_enter(c, fall2); + c.bind(next); + build_transaction_enter(c, fall2, x86::r12, 666); if (s_tsx_avx) { @@ -944,57 +881,12 @@ const auto spu_putlluc_tx = build_function_asm(dst) = *reinterpret_cast(src); + *reinterpret_cast(dst) = *reinterpret_cast(src); dst += 16; src += 16; @@ -1501,7 +1393,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args) while (size >= 128) { - mov_rdata(reinterpret_cast(dst), reinterpret_cast(src)); + mov_rdata(*reinterpret_cast(dst), *reinterpret_cast(src)); dst += 128; src += 128; @@ -1510,7 +1402,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args) while (size) { - *reinterpret_cast(dst) = *reinterpret_cast(src); + *reinterpret_cast(dst) = *reinterpret_cast(src); dst += 16; src += 16; @@ -1556,7 +1448,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args) { while (size >= 128) { - mov_rdata(reinterpret_cast(dst), reinterpret_cast(src)); + mov_rdata(*reinterpret_cast(dst), *reinterpret_cast(src)); dst += 128; src += 128; @@ -1565,7 +1457,7 @@ 
void spu_thread::do_dma_transfer(const spu_mfc_cmd& args) while (size) { - *reinterpret_cast(dst) = *reinterpret_cast(src); + *reinterpret_cast(dst) = *reinterpret_cast(src); dst += 16; src += 16; @@ -1690,7 +1582,7 @@ void spu_thread::do_putlluc(const spu_mfc_cmd& args) if (raddr && addr == raddr) { // Last check for event before we clear the reservation - if ((vm::reservation_acquire(addr, 128) & -128) != rtime || rdata != vm::_ref(addr)) + if ((vm::reservation_acquire(addr, 128) & -128) != rtime || !cmp_rdata(rdata, vm::_ref(addr))) { ch_event_stat |= SPU_EVENT_LR; } @@ -1703,11 +1595,31 @@ void spu_thread::do_putlluc(const spu_mfc_cmd& args) // Store unconditionally if (LIKELY(g_use_rtm)) { - const u64 count = spu_putlluc_tx(addr, to_write.data()); + const u32 result = spu_putlluc_tx(addr, to_write.data(), this); - if (count >= 10) + if (result == 2) { - LOG_ERROR(SPU, "%s took too long: %u", args.cmd, count); + cpu_thread::suspend_all cpu_lock(this); + + // Try to obtain bit 7 (+64) + if (!atomic_storage::bts(vm::reservation_acquire(addr, 128).raw(), 6)) + { + auto& data = vm::_ref(addr); + mov_rdata(data, to_write); + + // Keep checking written data against a rogue transaction sneak in + while (std::atomic_thread_fence(std::memory_order_seq_cst), !cmp_rdata(data, to_write)) + { + mov_rdata(data, to_write); + } + + vm::reservation_acquire(addr, 128) += 63; + } + else + { + // Give up if another PUTLLUC command took precedence + vm::reservation_acquire(addr, 128) -= 1; + } } } else @@ -1722,12 +1634,12 @@ void spu_thread::do_putlluc(const spu_mfc_cmd& args) // Full lock (heavyweight) // TODO: vm::check_addr vm::writer_lock lock(addr); - mov_rdata(data.data(), to_write.data()); + mov_rdata(data, to_write); res.release(res.load() + 127); } else { - mov_rdata(data.data(), to_write.data()); + mov_rdata(data, to_write); res.release(res.load() + 127); } } @@ -1847,6 +1759,8 @@ bool spu_thread::process_mfc_cmd() // Stall infinitely if MFC queue is full while (UNLIKELY(mfc_size >= 16)) { + state += cpu_flag::wait; + if (is_stopped()) { return false; @@ -1873,8 +1787,10 @@ bool spu_thread::process_mfc_cmd() { rtime = vm::reservation_acquire(addr, 128) & -128; - while (rdata == data && (vm::reservation_acquire(addr, 128)) == rtime) + while (cmp_rdata(rdata, data) && (vm::reservation_acquire(addr, 128)) == rtime) { + state += cpu_flag::wait; + if (is_stopped()) { break; @@ -1882,15 +1798,40 @@ bool spu_thread::process_mfc_cmd() thread_ctrl::wait_for(100); } + + if (test_stopped()) + { + return false; + } } - if (LIKELY(g_use_rtm)) + if (LIKELY(g_use_rtm && !g_cfg.core.spu_accurate_getllar && raddr != addr)) { - const u64 count = g_cfg.core.spu_accurate_getllar ? 
spu_getll_tx(addr, dst.data(), &ntime) : spu_getll_fast(addr, dst.data(), &ntime); + // TODO: maybe always start from a transaction + ntime = spu_getll_inexact(addr, dst.data()); + } + else if (g_use_rtm) + { + ntime = spu_getll_tx(addr, dst.data()); - if (count >= 10) + if (ntime == 1) { - LOG_ERROR(SPU, "%s took too long: %u", ch_mfc_cmd.cmd, count); + if (!g_cfg.core.spu_accurate_getllar) + { + ntime = spu_getll_inexact(addr, dst.data()); + } + else + { + cpu_thread::suspend_all cpu_lock(this); + + while (vm::reservation_acquire(addr, 128) & 127) + { + busy_wait(100); + } + + ntime = vm::reservation_acquire(addr, 128); + mov_rdata(dst, data); + } } } else @@ -1907,37 +1848,37 @@ bool spu_thread::process_mfc_cmd() vm::writer_lock lock(addr); ntime = old_time; - mov_rdata(dst.data(), data.data()); + mov_rdata(dst, data); res.release(old_time); } else { ntime = old_time; - mov_rdata(dst.data(), data.data()); + mov_rdata(dst, data); res.release(old_time); } } - if (const u32 _addr = raddr) + if (raddr && raddr != addr) { // Last check for event before we replace the reservation with a new one - if ((vm::reservation_acquire(_addr, 128) & -128) != rtime || rdata != vm::_ref(_addr)) + if ((vm::reservation_acquire(raddr, 128) & -128) != rtime || !cmp_rdata(rdata, vm::_ref(raddr))) + { + ch_event_stat |= SPU_EVENT_LR; + } + } + else if (raddr == addr) + { + // Lost previous reservation on polling + if (ntime != rtime || !cmp_rdata(rdata, dst)) { ch_event_stat |= SPU_EVENT_LR; - - if (_addr == addr) - { - // Lost current reservation - raddr = 0; - ch_atomic_stat.set_value(MFC_GETLLAR_SUCCESS); - return true; - } } } raddr = addr; rtime = ntime; - mov_rdata(rdata.data(), dst.data()); + mov_rdata(rdata, dst); ch_atomic_stat.set_value(MFC_GETLLAR_SUCCESS); return true; @@ -1949,29 +1890,39 @@ bool spu_thread::process_mfc_cmd() const u32 addr = ch_mfc_cmd.eal & -128u; u32 result = 0; - if (raddr == addr && rtime == (vm::reservation_acquire(raddr, 128) & -128)) + if (raddr == addr) { const auto& to_write = _ref(ch_mfc_cmd.lsa & 0x3ff80); if (LIKELY(g_use_rtm)) { - u64 count = spu_putllc_tx(addr, rtime, rdata.data(), to_write.data()); + result = spu_putllc_tx(addr, rtime, rdata.data(), to_write.data()); - if ((count >> 63) == 0) + if (result == 2) { - result = 1; - } - else - { - count = ~count; - } + result = 0; - if (count >= 10) - { - LOG_ERROR(SPU, "%s took too long: %u (r=%u)", ch_mfc_cmd.cmd, count, result); + cpu_thread::suspend_all cpu_lock(this); + + // Give up if other PUTLLC/PUTLLUC commands are in progress + if (!vm::reservation_acquire(addr, 128).try_dec(rtime + 1)) + { + auto& data = vm::_ref(addr); + + if ((vm::reservation_acquire(addr, 128) & -128) == rtime && cmp_rdata(rdata, data)) + { + mov_rdata(data, to_write); + vm::reservation_acquire(addr, 128) += 127; + result = 1; + } + else + { + vm::reservation_acquire(addr, 128) -= 1; + } + } } } - else if (auto& data = vm::_ref(addr); rdata == data) + else if (auto& data = vm::_ref(addr); rtime == (vm::reservation_acquire(raddr, 128) & -128) && cmp_rdata(rdata, data)) { auto& res = vm::reservation_lock(raddr, 128); const u64 old_time = res.load() & -128; @@ -1984,9 +1935,9 @@ bool spu_thread::process_mfc_cmd() // TODO: vm::check_addr vm::writer_lock lock(addr); - if (rdata == data) + if (cmp_rdata(rdata, data)) { - mov_rdata(data.data(), to_write.data()); + mov_rdata(data, to_write); res.release(old_time + 128); result = 1; } @@ -2012,7 +1963,7 @@ bool spu_thread::process_mfc_cmd() if (raddr) { // Last check for event before we clear the 
reservation - if (raddr == addr || rtime != (vm::reservation_acquire(raddr, 128) & -128) || rdata != vm::_ref(raddr)) + if (raddr == addr || rtime != (vm::reservation_acquire(raddr, 128) & -128) || !cmp_rdata(rdata, vm::_ref(raddr))) { ch_event_stat |= SPU_EVENT_LR; } @@ -2164,7 +2115,7 @@ u32 spu_thread::get_events(bool waiting) } // Check reservation status and set SPU_EVENT_LR if lost - if (raddr && ((vm::reservation_acquire(raddr, sizeof(rdata)) & -128) != rtime || rdata != vm::_ref(raddr))) + if (raddr && ((vm::reservation_acquire(raddr, sizeof(rdata)) & -128) != rtime || !cmp_rdata(rdata, vm::_ref(raddr)))) { ch_event_stat |= SPU_EVENT_LR; raddr = 0; @@ -2256,6 +2207,11 @@ s64 spu_thread::get_ch_value(u32 ch) auto read_channel = [&](spu_channel& channel) -> s64 { + if (channel.get_count() == 0) + { + state += cpu_flag::wait; + } + for (int i = 0; i < 10 && channel.get_count() == 0; i++) { busy_wait(); @@ -2273,6 +2229,7 @@ s64 spu_thread::get_ch_value(u32 ch) thread_ctrl::wait(); } + check_state(); return out; }; @@ -2284,6 +2241,11 @@ s64 spu_thread::get_ch_value(u32 ch) } case SPU_RdInMbox: { + if (ch_in_mbox.get_count() == 0) + { + state += cpu_flag::wait; + } + while (true) { for (int i = 0; i < 10 && ch_in_mbox.get_count() == 0; i++) @@ -2300,6 +2262,7 @@ s64 spu_thread::get_ch_value(u32 ch) int_ctrl[2].set(SPU_INT2_STAT_SPU_MAILBOX_THRESHOLD_INT); } + check_state(); return out; } @@ -2410,6 +2373,8 @@ s64 spu_thread::get_ch_value(u32 ch) while (res = get_events(), !res) { + state += cpu_flag::wait; + if (is_stopped()) { return -1; @@ -2418,11 +2383,14 @@ s64 spu_thread::get_ch_value(u32 ch) pseudo_lock.wait(100); } + check_state(); return res; } while (res = get_events(true), !res) { + state += cpu_flag::wait; + if (is_stopped()) { return -1; @@ -2431,6 +2399,7 @@ s64 spu_thread::get_ch_value(u32 ch) thread_ctrl::wait_for(100); } + check_state(); return res; } @@ -2463,6 +2432,8 @@ bool spu_thread::set_ch_value(u32 ch, u32 value) { while (!ch_out_intr_mbox.try_push(value)) { + state += cpu_flag::wait; + if (is_stopped()) { return false; @@ -2472,9 +2443,12 @@ bool spu_thread::set_ch_value(u32 ch, u32 value) } int_ctrl[2].set(SPU_INT2_STAT_MAILBOX_INT); + check_state(); return true; } + state += cpu_flag::wait; + const u32 code = value >> 24; { if (code < 64) @@ -2609,6 +2583,8 @@ bool spu_thread::set_ch_value(u32 ch, u32 value) { while (!ch_out_mbox.try_push(value)) { + state += cpu_flag::wait; + if (is_stopped()) { return false; @@ -2617,6 +2593,7 @@ bool spu_thread::set_ch_value(u32 ch, u32 value) thread_ctrl::wait(); } + check_state(); return true; } @@ -2770,6 +2747,7 @@ bool spu_thread::stop_and_signal(u32 code) if (offset >= RAW_SPU_BASE_ADDR) { + state += cpu_flag::wait; status.atomic_op([code](u32& status) { status = (status & 0xffff) | (code << 16); @@ -2779,6 +2757,7 @@ bool spu_thread::stop_and_signal(u32 code) int_ctrl[2].set(SPU_INT2_STAT_SPU_STOP_AND_SIGNAL_INT); state += cpu_flag::stop; + check_state(); return true; } @@ -2808,6 +2787,8 @@ bool spu_thread::stop_and_signal(u32 code) // HACK: wait for executable code while (!_ref(pc)) { + state += cpu_flag::wait; + if (is_stopped()) { return false; @@ -2816,12 +2797,15 @@ bool spu_thread::stop_and_signal(u32 code) thread_ctrl::wait_for(1000); } + check_state(); return false; } case 0x001: { + state += cpu_flag::wait; thread_ctrl::wait_for(1000); // hack + check_state(); return true; } @@ -2857,6 +2841,8 @@ bool spu_thread::stop_and_signal(u32 code) std::shared_ptr queue; + state += cpu_flag::wait; + while (true) { 
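All of the channel paths above follow one discipline: raise cpu_flag::wait before anything that can block, and run check_state() once a value is in hand, so the wait flag is dropped and any pending pause is serviced before control returns to generated code. A condensed sketch of that shape, assuming the spu_channel/thread_ctrl API used above (read_channel_sketch is an illustrative name):

s64 read_channel_sketch(spu_thread& spu, spu_channel& ch)
{
	if (ch.get_count() == 0)
	{
		// Safe point: suspend_all may now treat this thread as parked
		spu.state += cpu_flag::wait;
	}

	u32 out = 0;

	while (!ch.try_pop(out))
	{
		if (spu.is_stopped())
		{
			return -1; // caller escapes without advancing pc, so RDCH restarts
		}

		thread_ctrl::wait();
	}

	// Drop cpu_flag::wait and honour any pause/suspend request before
	// handing the value back to compiled code
	spu.check_state();
	return out;
}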
queue.reset(); @@ -2897,6 +2883,7 @@ bool spu_thread::stop_and_signal(u32 code) if (!queue) { + check_state(); return ch_in_mbox.set_values(1, CELL_EINVAL), true; // TODO: check error value } @@ -2927,6 +2914,7 @@ bool spu_thread::stop_and_signal(u32 code) const auto data3 = static_cast(std::get<3>(event)); ch_in_mbox.set_values(4, CELL_OK, data1, data2, data3); queue->events.pop_front(); + check_state(); return true; } } @@ -2972,6 +2960,7 @@ bool spu_thread::stop_and_signal(u32 code) } } + check_state(); return true; } @@ -3045,6 +3034,8 @@ bool spu_thread::stop_and_signal(u32 code) { /* ===== sys_spu_thread_group_exit ===== */ + state += cpu_flag::wait; + u32 value = 0; if (!ch_out_mbox.try_pop(value)) @@ -3069,6 +3060,7 @@ bool spu_thread::stop_and_signal(u32 code) group->join_state = SYS_SPU_THREAD_GROUP_JOIN_GROUP_EXIT; state += cpu_flag::stop; + check_state(); return true; } @@ -3076,6 +3068,8 @@ bool spu_thread::stop_and_signal(u32 code) { /* ===== sys_spu_thread_exit ===== */ + state += cpu_flag::wait; + if (!ch_out_mbox.get_count()) { fmt::throw_exception("sys_spu_thread_exit(): Out_MBox is empty" HERE); @@ -3084,6 +3078,7 @@ bool spu_thread::stop_and_signal(u32 code) LOG_TRACE(SPU, "sys_spu_thread_exit(status=0x%x)", ch_out_mbox.get_value()); status |= SPU_STATUS_STOPPED_BY_STOP; state += cpu_flag::stop; + check_state(); return true; } } diff --git a/rpcs3/Emu/Cell/SPUThread.h b/rpcs3/Emu/Cell/SPUThread.h index 177ed87d9b..b79075a2f5 100644 --- a/rpcs3/Emu/Cell/SPUThread.h +++ b/rpcs3/Emu/Cell/SPUThread.h @@ -529,7 +529,7 @@ public: // Reservation Data u64 rtime = 0; - std::array rdata{}; + std::array rdata{}; u32 raddr = 0; u32 srr0; diff --git a/rpcs3/Emu/Cell/lv2/sys_net.cpp b/rpcs3/Emu/Cell/lv2/sys_net.cpp index 8d08b6aa10..ea3d87d302 100644 --- a/rpcs3/Emu/Cell/lv2/sys_net.cpp +++ b/rpcs3/Emu/Cell/lv2/sys_net.cpp @@ -357,6 +357,11 @@ s32 sys_net_bnet_accept(ppu_thread& ppu, s32 s, vm::ptr addr, } } + if (ppu.is_stopped()) + { + return 0; + } + auto newsock = std::make_shared(native_socket); result = idm::import_existing(newsock); @@ -975,6 +980,11 @@ s32 sys_net_bnet_recvfrom(ppu_thread& ppu, s32 s, vm::ptr buf, u32 len, s3 } } + if (ppu.is_stopped()) + { + return 0; + } + // TODO if (addr) { @@ -1796,6 +1806,11 @@ s32 sys_net_bnet_select(ppu_thread& ppu, s32 nfds, vm::ptr readf } } + if (ppu.is_stopped()) + { + return 0; + } + if (readfds) *readfds = rread; if (writefds) diff --git a/rpcs3/Emu/Memory/vm.cpp b/rpcs3/Emu/Memory/vm.cpp index 0e4fed623e..1a6dd0a623 100644 --- a/rpcs3/Emu/Memory/vm.cpp +++ b/rpcs3/Emu/Memory/vm.cpp @@ -172,6 +172,8 @@ namespace vm void temporary_unlock(cpu_thread& cpu) noexcept { + cpu.state += cpu_flag::wait; + if (g_tls_locked && g_tls_locked->compare_and_swap_test(&cpu, nullptr)) { cpu.cpu_unmem(); diff --git a/rpcs3/Emu/System.cpp b/rpcs3/Emu/System.cpp index 71d0d66f77..909120786a 100644 --- a/rpcs3/Emu/System.cpp +++ b/rpcs3/Emu/System.cpp @@ -936,11 +936,18 @@ void Emulator::Load(const std::string& title_id, bool add_only, bool force_globa // Set RTM usage g_use_rtm = utils::has_rtm() && ((utils::has_mpx() && g_cfg.core.enable_TSX == tsx_usage::enabled) || g_cfg.core.enable_TSX == tsx_usage::forced); + if (g_use_rtm && !utils::has_mpx()) { LOG_WARNING(GENERAL, "TSX forced by User"); } + if (g_use_rtm && g_cfg.core.preferred_spu_threads) + { + g_cfg.core.preferred_spu_threads.set(0); + LOG_ERROR(GENERAL, "Preferred SPU Threads forcefully disabled - not compatible with TSX in this version."); + } + // Load patches from different locations 
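	// Note: utils::has_mpx() acts as a proxy for "not a Haswell/Broadwell-era
	// TSX implementation" (see s_tsx_haswell above); without MPX, TSX is only
	// used when the user explicitly selects "forced".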
 	fxm::check_unlocked<patch_engine>()->append(fs::get_config_dir() + "data/" + m_title_id + "/patch.yml");
diff --git a/rpcs3/Emu/System.h b/rpcs3/Emu/System.h
index a6c939b1c8..cdf38b1027 100644
--- a/rpcs3/Emu/System.h
+++ b/rpcs3/Emu/System.h
@@ -385,7 +385,6 @@ struct cfg_root : cfg::node
 	cfg::_enum<spu_block_size_type> spu_block_size{this, "SPU Block Size", spu_block_size_type::safe};
 	cfg::_bool spu_accurate_getllar{this, "Accurate GETLLAR", false};
 	cfg::_bool spu_accurate_putlluc{this, "Accurate PUTLLUC", false};
-	cfg::_bool spu_accurate_putllc{this, "Accurate PUTLLC", false};
 	cfg::_bool spu_verification{this, "SPU Verification", true}; // Should be enabled
 	cfg::_bool spu_cache{this, "SPU Cache", true};
 	cfg::_enum<tsx_usage> enable_TSX{this, "Enable TSX", tsx_usage::enabled}; // Enable TSX. Forcing this on Haswell/Broadwell CPUs should be used carefully
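Taken together, the suspend_all scope is what stands in for the removed "Accurate PUTLLC" option: when a transaction keeps failing, the thread pauses every other cpu_thread and performs the store outside any transaction. A minimal usage sketch of the RAII helper (names from CPUThread.h; the comments restate the constructor/destructor behaviour shown in CPUThread.cpp, while with_all_threads_paused itself is illustrative):

void with_all_threads_paused(cpu_thread* self)
{
	// Constructor: flags every registered thread with cpu_flag::pause and
	// spins until each one reports cpu_flag::wait, i.e. sits in check_state()
	cpu_thread::suspend_all lock(self);

	// Exclusive section: no guest thread is executing, so emulated memory
	// and reservations may be modified non-atomically, as in the PUTLLC/
	// PUTLLUC fallback paths above

	// Destructor: clears cpu_flag::pause under g_cpu_pause_lock and wakes
	// the parked threads via g_cpu_array_lock; 'self' then re-runs
	// check_state() on its own state
}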