SPU LLVM: Optimize branches following byteswaps

- The first element can be extracted via vmovd rather than vpextrd, which saves 1 uop.
2024-11-26 04:32:35 +01:00 · 2021-09-29 05:09:42 -04:00 · 2021-09-29 05:09:42 -04:00 · 86716dc37b
commit 86716dc37b
parent f9ab077908
1 changed files with 59 additions and 1 deletions
--- a/rpcs3/Emu/Cell/SPURecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPURecompiler.cpp
@ -9232,6 +9232,20 @@ public:
 	{
 		if (m_block) m_block->block_end = m_ir->GetInsertBlock();

+		const auto rt = get_vr<u8[16]>(op.rt);
+
+		// Checking for zero doesn't care about the order of the bytes,
+		// so load the data before it's byteswapped
+		if (auto [ok, as] = match_expr(rt, byteswap(match<u8[16]>())); ok)
+		{
+			m_block->block_end = m_ir->GetInsertBlock();
+			const auto cond = eval(extract(bitcast<u32[4]>(as), 0) == 0);
+			const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc);
+			const auto target = add_block_indirect(op, addr);
+			m_ir->CreateCondBr(cond.value, target, add_block_next());
+			return;
+		}
+
 		// Check sign bit instead (optimization)
 		if (match_vr<s32[4], s64[2]>(op.rt, [&](auto c, auto MP)
 		{
@ -9263,6 +9277,21 @@ public:
 	{
 		if (m_block) m_block->block_end = m_ir->GetInsertBlock();

+		const auto rt = get_vr<u8[16]>(op.rt);
+
+		// Checking for zero doesn't care about the order of the bytes,
+		// so load the data before it's byteswapped
+		if (auto [ok, as] = match_expr(rt, byteswap(match<u8[16]>())); ok)
+		{
+			m_block->block_end = m_ir->GetInsertBlock();
+			const auto cond = eval(extract(bitcast<u32[4]>(as), 0) != 0);
+			const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc);
+			const auto target = add_block_indirect(op, addr);
+			m_ir->CreateCondBr(cond.value, target, add_block_next());
+			return;
+		}
+
+
 		// Check sign bit instead (optimization)
 		if (match_vr<s32[4], s64[2]>(op.rt, [&](auto c, auto MP)
 		{
@ -9483,6 +9512,21 @@ public:

 		const u32 target = spu_branch_target(m_pos, op.i16);

+		const auto rt = get_vr<u8[16]>(op.rt);
+
+		// Checking for zero doesn't care about the order of the bytes,
+		// so load the data before it's byteswapped
+		if (auto [ok, as] = match_expr(rt, byteswap(match<u8[16]>())); ok)
+		{
+			if (target != m_pos + 4)
+			{
+				m_block->block_end = m_ir->GetInsertBlock();
+				const auto cond = eval(extract(bitcast<u32[4]>(as), 0) == 0);
+				m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4));
+				return;
+			}
+		}
+
 		// Check sign bit instead (optimization)
 		if (match_vr<s32[4], s64[2]>(op.rt, [&](auto c, auto MP)
 		{
@ -9527,6 +9571,21 @@ public:

 		const u32 target = spu_branch_target(m_pos, op.i16);

+		const auto rt = get_vr<u8[16]>(op.rt);
+
+		// Checking for zero doesn't care about the order of the bytes,
+		// so load the data before it's byteswapped
+		if (auto [ok, as] = match_expr(rt, byteswap(match<u8[16]>())); ok)
+		{
+			if (target != m_pos + 4)
+			{
+				m_block->block_end = m_ir->GetInsertBlock();
+				const auto cond = eval(extract(bitcast<u32[4]>(as), 0) != 0);
+				m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4));
+				return;
+			}
+		}
+
 		// Check sign bit instead (optimization)
 		if (match_vr<s32[4], s64[2]>(op.rt, [&](auto c, auto MP)
 		{
@ -9583,7 +9642,6 @@ public:
 					m_block->block_end = m_ir->GetInsertBlock();
 					const auto a = get_vr<s8[16]>(op.rt);
 					const auto cond = eval((bitcast<s16>(trunc<bool[16]>(a)) & 0x3000) == 0);
-					//const auto cond = eval((m & 0x3000) == 0);
 					m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4));
 					return true;
 				}