diff --git a/Utilities/JIT.h b/Utilities/JIT.h
index 17be332d07..7a33d8f975 100644
--- a/Utilities/JIT.h
+++ b/Utilities/JIT.h
@@ -57,31 +57,68 @@ namespace asmjit
 	// Emit xbegin and adjacent loop, return label at xbegin (don't use xabort please)
 	template <typename F>
-	[[nodiscard]] inline asmjit::Label build_transaction_enter(asmjit::X86Assembler& c, asmjit::Label fallback, const asmjit::X86Gp& ctr, uint less_than, F func)
+	[[nodiscard]] inline asmjit::Label build_transaction_enter(asmjit::X86Assembler& c, asmjit::Label fallback, F func)
 	{
 		Label fall = c.newLabel();
 		Label begin = c.newLabel();
 		c.jmp(begin);
 		c.bind(fall);
 
-		// First invoked after failure
-		func();
-
-		c.add(ctr, 1);
-
 		// Don't repeat on zero status (may indicate syscall or interrupt)
 		c.test(x86::eax, x86::eax);
 		c.jz(fallback);
 
+		// First invoked after failure (can fallback to proceed, or jump anywhere else)
+		func();
+
 		// Other bad statuses are ignored regardless of repeat flag (TODO)
-		c.cmp(ctr, less_than);
-		c.jae(fallback);
 		c.align(kAlignCode, 16);
 		c.bind(begin);
 		return fall;
 
 		// xbegin should be issued manually, allows to add more check before entering transaction
 	}
+
+	// Helper to spill RDX (EDX) register for RDTSC
+	inline void build_swap_rdx_with(asmjit::X86Assembler& c, std::array<X86Gp, 4>& args, const asmjit::X86Gp& with)
+	{
+#ifdef _WIN32
+		c.xchg(args[1], with);
+		args[1] = with;
+#else
+		c.xchg(args[2], with);
+		args[2] = with;
+#endif
+	}
+
+	// Get full RDTSC value into chosen register (clobbers rax/rdx or saves only rax with other target)
+	inline void build_get_tsc(asmjit::X86Assembler& c, const asmjit::X86Gp& to = asmjit::x86::rax)
+	{
+		if (&to != &x86::rax && &to != &x86::rdx)
+		{
+			// Swap to save its contents
+			c.xchg(x86::rax, to);
+		}
+
+		c.rdtsc();
+		c.shl(x86::rdx, 32);
+
+		if (&to == &x86::rax)
+		{
+			c.or_(x86::rax, x86::rdx);
+		}
+		else if (&to == &x86::rdx)
+		{
+			c.or_(x86::rdx, x86::rax);
+		}
+		else
+		{
+			// Swap back, maybe there is more effective way to do it
+			c.xchg(x86::rax, to);
+			c.mov(to.r32(), to.r32());
+			c.or_(to.r64(), x86::rdx);
+		}
+	}
 }
 
 // Build runtime function with asmjit::X86Assembler
diff --git a/Utilities/asm.h b/Utilities/asm.h
index 7a93c211ed..e56a2a121c 100644
--- a/Utilities/asm.h
+++ b/Utilities/asm.h
@@ -2,15 +2,18 @@
 
 #include "types.h"
 
+extern bool g_use_rtm;
+extern u64 g_rtm_tx_limit1;
+
 namespace utils
 {
-	// Transaction helper (Max = max attempts) (result = pair of success and op result)
-	template <uint Max = 10, typename F, typename R = std::invoke_result_t<F>>
+	// Transaction helper (result = pair of success and op result, or just bool)
+	template <typename F, typename R = std::invoke_result_t<F>>
 	inline auto tx_start(F op)
 	{
 		uint status = -1;
 
-		for (uint i = 0; i < Max; i++)
+		for (auto stamp0 = __rdtsc(), stamp1 = stamp0; g_use_rtm && stamp1 - stamp0 <= g_rtm_tx_limit1; stamp1 = __rdtsc())
 		{
 #ifndef _MSC_VER
 			__asm__ goto ("xbegin %l[retry];" ::: "memory" : retry);
diff --git a/rpcs3/Emu/CPU/CPUThread.h b/rpcs3/Emu/CPU/CPUThread.h
index b6e44aa891..044021d1cb 100644
--- a/rpcs3/Emu/CPU/CPUThread.h
+++ b/rpcs3/Emu/CPU/CPUThread.h
@@ -73,6 +73,11 @@ public:
 		return !!(state & (cpu_flag::suspend + cpu_flag::dbg_global_pause + cpu_flag::dbg_pause));
 	}
 
+	bool has_pause_flag() const
+	{
+		return !!(state & cpu_flag::pause);
+	}
+
 	// Check thread type
 	u32 id_type() const
 	{
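The asm.h hunk above is the heart of the change: the fixed `Max` attempt count becomes a wall-clock budget measured in TSC ticks. A minimal stand-alone sketch of that pattern (names such as `retry_with_tsc_budget` and `attempt` are illustrative, not rpcs3 APIs):

```cpp
#include <cstdint>
#include <x86intrin.h> // __rdtsc (use <intrin.h> on MSVC)

// Retry `attempt` until it succeeds or `limit_ticks` TSC ticks have elapsed.
// Mirrors the shape of the new tx_start loop; `attempt` stands in for one
// xbegin/xend transaction try.
template <typename F>
bool retry_with_tsc_budget(F attempt, uint64_t limit_ticks)
{
	for (uint64_t stamp0 = __rdtsc(), stamp1 = stamp0; stamp1 - stamp0 <= limit_ticks; stamp1 = __rdtsc())
	{
		if (attempt())
		{
			return true; // transaction committed
		}
	}

	return false; // budget exhausted, caller takes the fallback path
}
```

The rationale: a fixed attempt count charges a cheap early abort and an expensive conflict abort the same, while a tick budget bounds the actual time wasted before falling back. The new `build_get_tsc` helper emits the same measurement for JIT-generated code: `rdtsc` returns the low/high halves in RAX/RDX, so the full value is composed as `(rdx << 32) | rax`, with `build_swap_rdx_with` spilling whichever argument register RDX aliases under the current ABI.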
diff --git a/rpcs3/Emu/Cell/Modules/cellSpurs.cpp b/rpcs3/Emu/Cell/Modules/cellSpurs.cpp
index 56e49ac3db..abca9012e9 100644
--- a/rpcs3/Emu/Cell/Modules/cellSpurs.cpp
+++ b/rpcs3/Emu/Cell/Modules/cellSpurs.cpp
@@ -292,7 +292,7 @@ namespace _spurs
 namespace _spurs
 {
 	// Add workload
-	s32 add_workload(vm::ptr<CellSpurs> spurs, vm::ptr<u32> wid, vm::cptr<void> pm, u32 size, u64 data, const u8(&priorityTable)[8], u32 minContention, u32 maxContention, vm::cptr<char> nameClass, vm::cptr<char> nameInstance, vm::ptr<CellSpursShutdownCompletionEventHook> hook, vm::ptr<void> hookArg);
+	s32 add_workload(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, vm::ptr<u32> wid, vm::cptr<void> pm, u32 size, u64 data, const u8(&priorityTable)[8], u32 minContention, u32 maxContention, vm::cptr<char> nameClass, vm::cptr<char> nameInstance, vm::ptr<CellSpursShutdownCompletionEventHook> hook, vm::ptr<void> hookArg);
 }
 
 //s32 _cellSpursWorkloadAttributeInitialize(vm::ptr<CellSpursWorkloadAttribute> attr, u32 revision, u32 sdkVersion, vm::cptr<void> pm, u32 size, u64 data, vm::cptr<u8[8]> priority, u32 minCnt, u32 maxCnt);
@@ -2295,7 +2295,7 @@ s32 cellSpursWorkloadAttributeSetShutdownCompletionEventHook(vm::ptr<CellSpursWo
-s32 _spurs::add_workload(vm::ptr<CellSpurs> spurs, vm::ptr<u32> wid, vm::cptr<void> pm, u32 size, u64 data, const u8(&priorityTable)[8], u32 minContention, u32 maxContention, vm::cptr<char> nameClass, vm::cptr<char> nameInstance, vm::ptr<CellSpursShutdownCompletionEventHook> hook, vm::ptr<void> hookArg)
+s32 _spurs::add_workload(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, vm::ptr<u32> wid, vm::cptr<void> pm, u32 size, u64 data, const u8(&priorityTable)[8], u32 minContention, u32 maxContention, vm::cptr<char> nameClass, vm::cptr<char> nameInstance, vm::ptr<CellSpursShutdownCompletionEventHook> hook, vm::ptr<void> hookArg)
 {
 	if (!spurs || !wid || !pm)
 	{
@@ -2420,7 +2420,7 @@ s32 _spurs::add_workload(vm::ptr<CellSpurs> spurs, vm::ptr<u32> wid, vm::cptr<vo
 	const auto wkl = &spurs->wklInfo(wnum);
-	vm::reservation_op(vm::unsafe_ptr_cast<spurs_wkl_state_op>(spurs.ptr(&CellSpurs::wklState1)), [&](spurs_wkl_state_op& op)
+	vm::reservation_op(ppu, vm::unsafe_ptr_cast<spurs_wkl_state_op>(spurs.ptr(&CellSpurs::wklState1)), [&](spurs_wkl_state_op& op)
 	{
 		const u32 mask = op.wklMskB & ~(0x80000000u >> wnum);
 		res_wkl = 0;
@@ -2456,12 +2456,12 @@ s32 _spurs::add_workload(vm::ptr<CellSpurs> spurs, vm::ptr<u32> wid, vm::cptr<vo
-s32 cellSpursAddWorkload(vm::ptr<CellSpurs> spurs, vm::ptr<u32> wid, vm::cptr<void> pm, u32 size, u64 data, vm::cptr<u8[8]> priority, u32 minCnt, u32 maxCnt)
+s32 cellSpursAddWorkload(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, vm::ptr<u32> wid, vm::cptr<void> pm, u32 size, u64 data, vm::cptr<u8[8]> priority, u32 minCnt, u32 maxCnt)
 {
 	cellSpurs.warning("cellSpursAddWorkload(spurs=*0x%x, wid=*0x%x, pm=*0x%x, size=0x%x, data=0x%llx, priority=*0x%x, minCnt=0x%x, maxCnt=0x%x)",
 		spurs, wid, pm, size, data, priority, minCnt, maxCnt);
 
-	return _spurs::add_workload(spurs, wid, pm, size, data, *priority, minCnt, maxCnt, vm::null, vm::null, vm::null, vm::null);
+	return _spurs::add_workload(ppu, spurs, wid, pm, size, data, *priority, minCnt, maxCnt, vm::null, vm::null, vm::null, vm::null);
 }
 
 /// Add workload
@@ -2484,7 +2484,7 @@ s32 cellSpursAddWorkloadWithAttribute(ppu_thread& ppu, vm::ptr<CellSpurs> spurs,
 		return CELL_SPURS_POLICY_MODULE_ERROR_INVAL;
 	}
 
-	return _spurs::add_workload(spurs, wid, attr->pm, attr->size, attr->data, attr->priority, attr->minContention, attr->maxContention, attr->nameClass, attr->nameInstance, attr->hook, attr->hookArg);
+	return _spurs::add_workload(ppu, spurs, wid, attr->pm, attr->size, attr->data, attr->priority, attr->minContention, attr->maxContention, attr->nameClass, attr->nameInstance, attr->hook, attr->hookArg);
 }
 
 /// Request workload shutdown
@@ -2506,7 +2506,7 @@ s32 cellSpursShutdownWorkload(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, u32 wid
 	bool send_event;
 	s32 rc, old_state;
 
-	if (!vm::reservation_op(vm::unsafe_ptr_cast<spurs_wkl_state_op>(spurs.ptr(&CellSpurs::wklState1)), [&](spurs_wkl_state_op& op)
+	if (!vm::reservation_op(ppu, vm::unsafe_ptr_cast<spurs_wkl_state_op>(spurs.ptr(&CellSpurs::wklState1)), [&](spurs_wkl_state_op& op)
 	{
 		auto& state = wid < CELL_SPURS_MAX_WORKLOAD ? op.wklState1[wid] : op.wklState2[wid % 16];
@@ -2663,7 +2663,7 @@ s32 cellSpursRemoveWorkload(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, u32 wid)
 	}
 
 	s32 rc;
-	vm::reservation_op(vm::unsafe_ptr_cast<spurs_wkl_state_op>(spurs.ptr(&CellSpurs::wklState1)), [&](spurs_wkl_state_op& op)
+	vm::reservation_op(ppu, vm::unsafe_ptr_cast<spurs_wkl_state_op>(spurs.ptr(&CellSpurs::wklState1)), [&](spurs_wkl_state_op& op)
 	{
 		auto& state = wid < CELL_SPURS_MAX_WORKLOAD ? op.wklState1[wid] : op.wklState2[wid % 16];
@@ -3040,7 +3040,7 @@ s32 _cellSpursWorkloadFlagReceiver(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, u3
 	};
 
 	s32 res;
-	vm::reservation_op(vm::unsafe_ptr_cast<wklFlagOp>(spurs), [&](wklFlagOp& val)
+	vm::reservation_op(ppu, vm::unsafe_ptr_cast<wklFlagOp>(spurs), [&](wklFlagOp& val)
 	{
 		if (is_set)
 		{
@@ -3189,7 +3189,7 @@ s32 cellSpursEventFlagSet(ppu_thread& ppu, vm::ptr<CellSpursEventFlag> eventFlag
 	u16 pendingRecv;
 	u16 pendingRecvTaskEvents[16];
 
-	vm::reservation_op(vm::unsafe_ptr_cast<CellSpursEventFlag_x00>(eventFlag), [bits, &send, &ppuWaitSlot, &ppuEvents, &pendingRecv, &pendingRecvTaskEvents](CellSpursEventFlag_x00& eventFlag)
+	vm::reservation_op(ppu, vm::unsafe_ptr_cast<CellSpursEventFlag_x00>(eventFlag), [bits, &send, &ppuWaitSlot, &ppuEvents, &pendingRecv, &pendingRecvTaskEvents](CellSpursEventFlag_x00& eventFlag)
 	{
 		send = false;
 		ppuWaitSlot = 0;
@@ -4081,7 +4081,7 @@ s32 _cellSpursSendSignal(ppu_thread& ppu, vm::ptr<CellSpursTaskset> taskset, u32
 	int signal;
 
-	vm::reservation_op(vm::unsafe_ptr_cast<spurs_taskset_signal_op>(taskset), [&](spurs_taskset_signal_op& op)
+	vm::reservation_op(ppu, vm::unsafe_ptr_cast<spurs_taskset_signal_op>(taskset), [&](spurs_taskset_signal_op& op)
 	{
 		const u32 signalled = op.signalled[taskId / 32];
 		const u32 running = op.running[taskId / 32];
@@ -4972,7 +4972,7 @@ s32 cellSpursJobGuardNotify(ppu_thread& ppu, vm::ptr<CellSpursJobGuard> jobGuard
 	u32 allow_jobchain_run = 0; // Affects cellSpursJobChainRun execution
 	u32 old = 0;
 
-	const bool ok = vm::reservation_op(vm::unsafe_ptr_cast<CellSpursJobGuard_x00>(jobGuard), [&](CellSpursJobGuard_x00& jg)
+	const bool ok = vm::reservation_op(ppu, vm::unsafe_ptr_cast<CellSpursJobGuard_x00>(jobGuard), [&](CellSpursJobGuard_x00& jg)
 	{
 		allow_jobchain_run = jg.zero;
 		old = jg.ncount0;
@@ -5136,7 +5136,7 @@ s32 cellSpursAddUrgentCommand(ppu_thread& ppu, vm::ptr<CellSpursJobChain> jobCha
 	s32 result = CELL_OK;
 
-	vm::reservation_op(vm::unsafe_ptr_cast<CellSpursJobChain_x00>(jobChain), [&](CellSpursJobChain_x00& jch)
+	vm::reservation_op(ppu, vm::unsafe_ptr_cast<CellSpursJobChain_x00>(jobChain), [&](CellSpursJobChain_x00& jch)
 	{
 		for (auto& cmd : jch.urgentCmds)
 		{
diff --git a/rpcs3/Emu/Cell/Modules/cellSpursSpu.cpp b/rpcs3/Emu/Cell/Modules/cellSpursSpu.cpp
index 75cc6a0eb6..3083ff0725 100644
--- a/rpcs3/Emu/Cell/Modules/cellSpursSpu.cpp
+++ b/rpcs3/Emu/Cell/Modules/cellSpursSpu.cpp
@@ -2074,7 +2074,7 @@ void spursJobchainPopUrgentCommand(spu_thread& spu)
 	const auto jc = vm::unsafe_ptr_cast<CellSpursJobChain_x00>(+ctxt->jobChain);
 	const bool alterQueue = ctxt->unkFlag0;
-	vm::reservation_op(jc, [&](CellSpursJobChain_x00& op)
+	vm::reservation_op(spu, jc, [&](CellSpursJobChain_x00& op)
 	{
 		const auto ls = reinterpret_cast<CellSpursJobChain_x00*>(ctxt->tempAreaJobChain);
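All of these mechanical call-site changes exist for one reason: `vm::reservation_op` now takes the executing thread (a `ppu_thread` here, the one `spu_thread` caller in cellSpursSpu.cpp) so its retry loop can notice a pending `cpu_flag::pause`, set by `cpu_thread::suspend_all`, and leave the transactional path promptly. A hypothetical condensed form of the guard that the vm_reservation.h hunk below adds (signatures simplified, `try_tx` is a placeholder for one transaction attempt):

```cpp
// Sketch only: the real reservation_op also manages the reservation lock,
// time stamps, and a final non-transactional fallback.
template <typename CPU, typename F>
bool try_reservation_op(CPU& cpu, F try_tx)
{
	while (/* time budget not exhausted */ true)
	{
		if (cpu.has_pause_flag())
		{
			break; // another thread requested suspend_all; stop spinning
		}

		if (try_tx())
		{
			return true;
		}
	}

	return false; // escalate to the locked / suspend-all path
}
```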
diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp
index 7e5844add7..0569ff44a1 100644
--- a/rpcs3/Emu/Cell/PPUThread.cpp
+++ b/rpcs3/Emu/Cell/PPUThread.cpp
@@ -1216,6 +1216,7 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
 	{
 		ppu.rtime = ppu.last_ftime;
 		ppu.raddr = ppu.last_faddr;
+		ppu.last_ftime = 0;
 		return static_cast<T>(rdata << data_off >> size_off);
 	}
 
@@ -1261,7 +1262,7 @@ extern u64 ppu_ldarx(ppu_thread& ppu, u32 addr)
 	return ppu_load_acquire_reservation<u64>(ppu, addr);
 }
 
-const auto ppu_stcx_accurate_tx = build_function_asm<u32(u32 raddr, u64 rtime, const void* _old, u64 _new)>([](asmjit::X86Assembler& c, auto& args)
+const auto ppu_stcx_accurate_tx = build_function_asm<u64(u32 raddr, u64 rtime, const void* _old, u64 _new)>([](asmjit::X86Assembler& c, auto& args)
 {
 	using namespace asmjit;
 
@@ -1282,6 +1283,8 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u64(u32 raddr, u64 rtime, c
 	c.bt(x86::dword_ptr(args[2], ::offset32(&ppu_thread::state) - ::offset32(&ppu_thread::rdata)), static_cast<u32>(cpu_flag::pause));
 	c.mov(x86::eax, _XABORT_EXPLICIT);
 	c.jc(fall);
@@ -1380,7 +1395,8 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u64(u32 raddr, u64 rtime, c
 	c.bt(x86::dword_ptr(args[2], ::offset32(&ppu_thread::state) - ::offset32(&ppu_thread::rdata)), static_cast<u32>(cpu_flag::pause));
 	c.jc(fall2);
 	c.mov(x86::rax, x86::qword_ptr(x86::rbx));
-	c.test(x86::rax, 127 - 1);
-	c.jnz(fall2);
 	c.and_(x86::rax, -128);
 	c.cmp(x86::rax, x86::r13);
 	c.jne(fail2);
@@ -1493,7 +1525,8 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u64(u32 raddr, u64 rtime, c
-	switch (u32 count = ppu_stcx_accurate_tx(addr & -8, rtime, ppu.rdata, std::bit_cast<u64>(new_data)))
+	switch (u64 count = ppu_stcx_accurate_tx(addr & -8, rtime, ppu.rdata, std::bit_cast<u64>(new_data)))
 	{
-	case UINT32_MAX:
+	case UINT64_MAX:
 	{
 		auto& all_data = *vm::get_super_ptr<spu_rdata_t>(addr & -128);
 		auto& sdata = *vm::get_super_ptr<atomic_be_t<u64>>(addr & -8);
@@ -1660,6 +1697,7 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
 			break;
 		}
 
+		ppu.last_ftime = -1;
 		[[fallthrough]];
 	}
 	case 0:
@@ -1669,6 +1707,12 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
 			ppu.last_fail++;
 		}
 
+		if (ppu.last_ftime != umax)
+		{
+			ppu.last_faddr = 0;
+			return false;
+		}
+
 		_m_prefetchw(ppu.rdata);
 		_m_prefetchw(ppu.rdata + 64);
 		ppu.last_faddr = addr;
@@ -1678,9 +1722,9 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
 	default:
 	{
-		if (count > 60 && g_cfg.core.perf_report) [[unlikely]]
+		if (count > 20000 && g_cfg.core.perf_report) [[unlikely]]
 		{
-			perf_log.warning("STCX: took too long: %u", count);
+			perf_log.warning(u8"STCX: took too long: %.3fµs (%u c)", count / (utils::get_tsc_freq() / 1000'000.), count);
 		}
 
 		break;
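Note the unit change running through this file: `ppu_stcx_accurate_tx` now returns a `u64` TSC-tick delta instead of a `u32` attempt count, which is why the switch cases move from `UINT32_MAX` to `UINT64_MAX`, the report threshold moves from 60 (attempts) to 20000 (ticks), and the log prints microseconds. The conversion, restated stand-alone (`utils::get_tsc_freq()` is the real rpcs3 helper; the rest is illustrative):

```cpp
#include <cstdint>
#include <cstdio>

// ticks / (ticks-per-second / 1e6) = microseconds
double ticks_to_us(uint64_t ticks, double tsc_freq_hz)
{
	return ticks / (tsc_freq_hz / 1000'000.);
}

int main()
{
	// e.g. the 20000-tick report threshold on a 2.5 GHz invariant TSC:
	std::printf("%.3f us\n", ticks_to_us(20000, 2.5e9)); // prints 8.000 us
}
```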
diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp
index 7c1dc1b762..b1d110f844 100644
--- a/rpcs3/Emu/Cell/SPUThread.cpp
+++ b/rpcs3/Emu/Cell/SPUThread.cpp
@@ -371,7 +371,7 @@ namespace spu
 	}
 }
 
-const auto spu_putllc_tx = build_function_asm<u32(u32 raddr, u64 rtime, void* _old, const void* _new)>([](asmjit::X86Assembler& c, auto& args)
+const auto spu_putllc_tx = build_function_asm<u64(u32 raddr, u64 rtime, void* _old, const void* _new)>([](asmjit::X86Assembler& c, auto& args)
 {
 	using namespace asmjit;
 
@@ -415,6 +415,7 @@ const auto spu_putllc_tx = build_function_asm<u64(u32 raddr, u64 rtime, void* _o
 	c.bt(x86::dword_ptr(args[2], ::offset32(&spu_thread::state) - ::offset32(&spu_thread::rdata)), static_cast<u32>(cpu_flag::pause));
 	c.mov(x86::eax, _XABORT_EXPLICIT);
@@ -531,7 +541,8 @@ const auto spu_putllc_tx = build_function_asm<u64(u32 raddr, u64 rtime, void* _o
 	c.bt(x86::dword_ptr(args[2], ::offset32(&spu_thread::state) - ::offset32(&spu_thread::rdata)), static_cast<u32>(cpu_flag::pause));
 	c.jc(fall2);
 	c.mov(x86::rax, x86::qword_ptr(x86::rbx));
-	c.test(x86::rax, 127 - 1);
-	c.jnz(fall2);
 	c.and_(x86::rax, -128);
 	c.cmp(x86::rax, x86::r13);
 	c.jne(fail2);
@@ -666,7 +691,8 @@ const auto spu_putllc_tx = build_function_asm<u64(u32 raddr, u64 rtime, void* _o
 
-const auto spu_putlluc_tx = build_function_asm<u32(u32 raddr, const void* rdata, cpu_thread* _cpu)>([](asmjit::X86Assembler& c, auto& args)
+const auto spu_putlluc_tx = build_function_asm<u64(u32 raddr, const void* rdata, cpu_thread* _cpu)>([](asmjit::X86Assembler& c, auto& args)
 {
 	using namespace asmjit;
 
@@ -792,6 +820,7 @@ const auto spu_putlluc_tx = build_function_asm<u64(u32 raddr, const void* rdata,
 
-const extern auto spu_getllar_tx = build_function_asm<u32(u32 raddr, void* rdata, cpu_thread* _cpu, u64 rtime)>([](asmjit::X86Assembler& c, auto& args)
+const extern auto spu_getllar_tx = build_function_asm<u64(u32 raddr, void* rdata, cpu_thread* _cpu, u64 rtime)>([](asmjit::X86Assembler& c, auto& args)
 {
 	using namespace asmjit;
 
@@ -979,6 +1033,7 @@ const extern auto spu_getllar_tx = build_function_asm<u64(u32 raddr, void* rdata
 	const u32 event_bit = SPU_EVENT_S1 >> (number & 1);
 	const u32 bitor_bit = (snr_config >> number) & 1;
 
+	// Redundant, g_use_rtm is checked inside tx_start now.
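Both the PPU hunks above and the SPU hunks that follow add a small failure-cache handshake around `last_faddr`/`last_ftime`: the `UINT64_MAX` case marks `last_ftime = -1` before falling through, the shared failure block refuses to refresh the cache when that mark is absent, and the GETLLAR hit path zeroes `last_ftime` so the cached data is consumed only once. A toy model of that state machine as I read the hunks (field and function names are simplifications, not rpcs3 APIs):

```cpp
#include <cstdint>

constexpr uint64_t umax = UINT64_MAX;

struct thread_model
{
	uint32_t last_faddr = 0; // address of the last failed reservation
	uint64_t last_ftime = 0; // reservation time observed at that failure
};

// Failure path of a store-conditional (mirrors the fallthrough in the diff).
// `saw_newer_data` corresponds to the UINT64_MAX case that sets last_ftime = -1.
bool on_store_failure(thread_model& t, uint32_t addr, bool saw_newer_data)
{
	if (saw_newer_data)
	{
		t.last_ftime = umax; // mark: the observed data is worth caching
	}

	if (t.last_ftime != umax)
	{
		// Ordinary failure: drop any stale cache and report plainly.
		t.last_faddr = 0;
		return false;
	}

	// Cache the failed address so the next GETLLAR can be served from it.
	t.last_faddr = addr;
	return false;
}

// GETLLAR hit on the cached address: reuse once, then invalidate.
void on_getllar_hit(thread_model& t)
{
	// rtime = last_ftime; raddr = last_faddr; (as in the diff)
	t.last_ftime = 0; // the cache is single-use
}
```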
 	if (g_use_rtm)
 	{
 		bool channel_notify = false;
 
@@ -2422,9 +2485,9 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
 	if (g_use_rtm) [[likely]]
 	{
-		switch (u32 count = spu_putllc_tx(addr, rtime, rdata, to_write))
+		switch (u64 count = spu_putllc_tx(addr, rtime, rdata, to_write))
 		{
-		case UINT32_MAX:
+		case UINT64_MAX:
 		{
 			auto& data = *vm::get_super_ptr<spu_rdata_t>(addr);
@@ -2451,6 +2514,7 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
 				break;
 			}
 
+			last_ftime = -1;
 			[[fallthrough]];
 		}
 		case 0:
@@ -2460,6 +2524,12 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
 				last_fail++;
 			}
 
+			if (last_ftime != umax)
+			{
+				last_faddr = 0;
+				return false;
+			}
+
 			_m_prefetchw(rdata);
 			_m_prefetchw(rdata + 64);
 			last_faddr = addr;
@@ -2469,9 +2539,9 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
 		default:
 		{
-			if (count > 60 && g_cfg.core.perf_report) [[unlikely]]
+			if (count > 20000 && g_cfg.core.perf_report) [[unlikely]]
 			{
-				perf_log.warning("PUTLLC: took too long: %u", count);
+				perf_log.warning(u8"PUTLLC: took too long: %.3fµs (%u c)", count / (utils::get_tsc_freq() / 1000'000.), count);
 			}
 
 			break;
@@ -2566,7 +2636,7 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write)
 
 	if (g_use_rtm) [[likely]]
 	{
-		const u32 result = spu_putlluc_tx(addr, to_write, cpu);
+		const u64 result = spu_putlluc_tx(addr, to_write, cpu);
 
 		if (result == 0)
 		{
@@ -2579,9 +2649,9 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write)
 				res += 127;
 			});
 		}
-		else if (result > 60 && g_cfg.core.perf_report) [[unlikely]]
+		else if (result > 20000 && g_cfg.core.perf_report) [[unlikely]]
 		{
-			perf_log.warning("STORE128: took too long: %u", result);
+			perf_log.warning(u8"STORE128: took too long: %.3fµs (%u c)", result / (utils::get_tsc_freq() / 1000'000.), result);
 		}
 
 		static_cast<void>(cpu->test_stopped());
@@ -2796,6 +2866,7 @@ bool spu_thread::process_mfc_cmd()
 	{
 		rtime = last_ftime;
 		raddr = last_faddr;
+		last_ftime = 0;
 		mov_rdata(_ref<spu_rdata_t>(ch_mfc_cmd.lsa & 0x3ff80), rdata);
 
 		ch_atomic_stat.set_value(MFC_GETLLAR_SUCCESS);
diff --git a/rpcs3/Emu/Memory/vm.cpp b/rpcs3/Emu/Memory/vm.cpp
index e28053e676..6fdd29072b 100644
--- a/rpcs3/Emu/Memory/vm.cpp
+++ b/rpcs3/Emu/Memory/vm.cpp
@@ -550,17 +550,19 @@ namespace vm
 	void reservation_op_internal(u32 addr, std::function<bool()> func)
 	{
-		auto& res = vm::reservation_acquire(addr, 128);
+		auto& res = vm::reservation_acquire(addr, 1);
+		auto* ptr = vm::get_super_ptr(addr & -128);
 
-		cpu_thread::suspend_all(get_current_cpu_thread(), {&res}, [&]
+		cpu_thread::suspend_all(get_current_cpu_thread(), {ptr, ptr + 64, &res}, [&]
 		{
 			if (func())
 			{
-				// Success, release all locks if necessary
+				// Success, release the lock and progress
 				res += 127;
 			}
 			else
 			{
+				// Only release the lock on failure
 				res -= 1;
 			}
 		});
diff --git a/rpcs3/Emu/Memory/vm_reservation.h b/rpcs3/Emu/Memory/vm_reservation.h
index 1e4b586763..be3903421c 100644
--- a/rpcs3/Emu/Memory/vm_reservation.h
+++ b/rpcs3/Emu/Memory/vm_reservation.h
@@ -7,6 +7,7 @@
 #include <functional>
 
 extern bool g_use_rtm;
+extern u64 g_rtm_tx_limit2;
 
 namespace vm
 {
@@ -70,8 +71,8 @@ namespace vm
 	// TODO: remove and make it external
 	void reservation_op_internal(u32 addr, std::function<bool()> func);
 
-	template <typename T, typename AT = u32, typename F>
-	SAFE_BUFFERS inline auto reservation_op(_ptr_base<T, AT> ptr, F op)
+	template <typename CPU, typename T, typename AT = u32, typename F>
+	SAFE_BUFFERS inline auto reservation_op(CPU& cpu, _ptr_base<T, AT> ptr, F op)
 	{
 		// Atomic operation will be performed on aligned 128 bytes of data, so the data size and alignment must comply
 		static_assert(sizeof(T) <= 128 && alignof(T) == sizeof(T), "vm::reservation_op: unsupported type");
@@ -94,9 +95,10 @@
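Taken together with the next hunks, `reservation_op` now runs in two stages: one optimistic transaction attempt, then a "lightened" locked sequence retried under the `g_rtm_tx_limit2` budget, with the stage-1 duration subtracted from the clock so it is not charged against the stage-2 budget. A stand-alone sketch of that control flow (the real code issues raw `xbegin`/`xend`; here a callable stands in):

```cpp
#include <cstdint>
#include <x86intrin.h>

template <typename F>
bool two_stage_tx(F attempt, uint64_t limit2_ticks, const bool& pause_flag)
{
	const uint64_t stamp0 = __rdtsc();

	// Stage 1: single optimistic attempt.
	if (attempt())
	{
		return true;
	}

	const uint64_t stamp1 = __rdtsc();

	// (the real code touches the faulting memory here, and locks the
	// reservation with res.fetch_add(1) before retrying)

	// Stage 2: initializing stamp2 as now - (stamp1 - stamp0) discounts the
	// time stage 1 itself consumed, effectively restarting the budget.
	for (uint64_t stamp2 = __rdtsc() - (stamp1 - stamp0); stamp2 - stamp0 <= limit2_ticks; stamp2 = __rdtsc())
	{
		if (pause_flag)
		{
			break; // cooperate with cpu_thread::suspend_all
		}

		if (attempt())
		{
			return true;
		}
	}

	return false; // caller escalates to reservation_op_internal
}
```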
 	{
 		// Stage 1: single optimistic transaction attempt
 		unsigned status = _XBEGIN_STARTED;
-		unsigned count = 0;
 		u64 _old = 0;
 
+		auto stamp0 = __rdtsc(), stamp1 = stamp0, stamp2 = stamp0;
+
 #ifndef _MSC_VER
 		__asm__ goto ("xbegin %l[stage2];" ::: "memory" : stage2);
 #else
@@ -157,6 +159,7 @@ namespace vm
 #ifndef _MSC_VER
 		__asm__ volatile ("mov %%eax, %0;" : "=r" (status) :: "memory");
 #endif
+		stamp1 = __rdtsc();
 
 		// Touch memory if transaction failed with status 0
 		if (!status)
@@ -167,12 +170,17 @@ namespace vm
 		// Stage 2: try to lock reservation first
 		_old = res.fetch_add(1);
 
-		// Also identify atomic op
-		count = 1;
+		// Compute stamps excluding memory touch
+		stamp2 = __rdtsc() - (stamp1 - stamp0);
 
-		// Start lightened transaction (TODO: tweaking)
-		for (; !(_old & rsrv_unique_lock) && count < 60; count++)
+		// Start lightened transaction
+		for (; !(_old & vm::rsrv_unique_lock) && stamp2 - stamp0 <= g_rtm_tx_limit2; stamp2 = __rdtsc())
 		{
+			if (cpu.has_pause_flag())
+			{
+				break;
+			}
+
#ifndef _MSC_VER
 			__asm__ goto ("xbegin %l[retry];" ::: "memory" : retry);
 #else
diff --git a/rpcs3/Emu/System.cpp b/rpcs3/Emu/System.cpp
index a71dbb51c1..521ec42b28 100644
--- a/rpcs3/Emu/System.cpp
+++ b/rpcs3/Emu/System.cpp
@@ -55,7 +55,9 @@ LOG_CHANNEL(sys_log, "SYS");
 
 stx::manual_fixed_typemap<void> g_fixed_typemap;
 
-bool g_use_rtm;
+bool g_use_rtm = false;
+u64 g_rtm_tx_limit1 = 0;
+u64 g_rtm_tx_limit2 = 0;
 
 std::string g_cfg_defaults;
 
@@ -1019,6 +1021,14 @@ game_boot_result Emulator::Load(const std::string& title_id, bool add_only, bool
 		}
 	}
 
+	if (g_use_rtm)
+	{
+		// Update supplementary settings
+		const f64 _1ns = utils::get_tsc_freq() / 1000'000'000.;
+		g_rtm_tx_limit1 = g_cfg.core.tx_limit1_ns * _1ns;
+		g_rtm_tx_limit2 = g_cfg.core.tx_limit2_ns * _1ns;
+	}
+
 	// Load patches from different locations
 	g_fxo->get<patch_engine>()->append_title_patches(m_title_id);
diff --git a/rpcs3/Emu/System.h b/rpcs3/Emu/System.h
index 2661d58f56..88d97d4115 100644
--- a/rpcs3/Emu/System.h
+++ b/rpcs3/Emu/System.h
@@ -240,3 +240,5 @@ private:
 extern Emulator Emu;
 
 extern bool g_use_rtm;
+extern u64 g_rtm_tx_limit1;
+extern u64 g_rtm_tx_limit2;
diff --git a/rpcs3/Emu/system_config.h b/rpcs3/Emu/system_config.h
index ca5f74145e..7be132b865 100644
--- a/rpcs3/Emu/system_config.h
+++ b/rpcs3/Emu/system_config.h
@@ -66,6 +66,8 @@ struct cfg_root : cfg::node
 		cfg::_bool hle_lwmutex{ this, "HLE lwmutex" }; // Force alternative lwmutex/lwcond implementation
 		cfg::uint64 spu_llvm_lower_bound{ this, "SPU LLVM Lower Bound" };
 		cfg::uint64 spu_llvm_upper_bound{ this, "SPU LLVM Upper Bound", 0xffffffffffffffff };
+		cfg::uint64 tx_limit1_ns{this, "TSX Transaction First Limit", 800}; // In nanoseconds
+		cfg::uint64 tx_limit2_ns{this, "TSX Transaction Second Limit", 2000}; // In nanoseconds
 
 		cfg::_int<10, 3000> clocks_scale{ this, "Clocks scale", 100, true }; // Changing this from 100 (percentage) may affect game speed in unexpected ways
 		cfg::_enum<sleep_timers_accuracy_level> sleep_timers_accuracy{ this, "Sleep Timers Accuracy",
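Finally, the nanosecond-denominated settings are converted once at boot into TSC tick budgets, so no division happens on the hot path. A stand-alone restatement of the System.cpp arithmetic (`utils::get_tsc_freq()` is the real rpcs3 helper; the function name here is illustrative):

```cpp
#include <cstdint>

// ticks = ns * (ticks per second / 1e9)
uint64_t ns_to_ticks(uint64_t ns, double tsc_freq_hz)
{
	return static_cast<uint64_t>(ns * (tsc_freq_hz / 1000'000'000.));
}

// With the defaults, 800 ns (first limit) and 2000 ns (second limit) on a
// 3.0 GHz invariant TSC yield budgets of 2400 and 6000 ticks, consumed by
// tx_start (g_rtm_tx_limit1) and reservation_op (g_rtm_tx_limit2) respectively.
```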