diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
index a770c0a309..c541bc424b 100644
--- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
@@ -1156,27 +1156,36 @@ void spu_recompiler::branch_fixed(u32 target)
 	c->jmp(x86::rax);
 }
 
-void spu_recompiler::branch_indirect(spu_opcode_t op)
+void spu_recompiler::branch_indirect(spu_opcode_t op, bool jt)
 {
 	using namespace asmjit;
 
-	if (!instr_table.isValid())
+	if (g_cfg.core.spu_block_size == spu_block_size_type::safe && !jt)
 	{
-		// Request instruction table
-		instr_table = c->newLabel();
+		// Simply external call (return or indirect call)
+		c->mov(x86::r10, x86::qword_ptr(*cpu, addr->r64(), 1, offset32(&SPUThread::jit_dispatcher)));
+		c->xor_(qw0->r32(), qw0->r32());
 	}
+	else
+	{
+		if (!instr_table.isValid())
+		{
+			// Request instruction table
+			instr_table = c->newLabel();
+		}
 
-	const u32 start = instr_labels.begin()->first;
-	const u32 end = instr_labels.rbegin()->first + 4;
+		const u32 start = instr_labels.begin()->first;
+		const u32 end = instr_labels.rbegin()->first + 4;
 
-	// Load indirect jump address, choose between local and external
-	c->lea(x86::r10, x86::qword_ptr(instr_table));
-	c->lea(*qw1, x86::qword_ptr(*addr, 0 - start));
-	c->xor_(qw0->r32(), qw0->r32());
-	c->cmp(qw1->r32(), end - start);
-	c->cmovae(qw1->r32(), qw0->r32());
-	c->cmovb(x86::r10, x86::qword_ptr(x86::r10, *qw1, 1, 0));
-	c->cmovae(x86::r10, x86::qword_ptr(*cpu, addr->r64(), 1, offset32(&SPUThread::jit_dispatcher)));
+		// Load indirect jump address, choose between local and external
+		c->lea(x86::r10, x86::qword_ptr(instr_table));
+		c->lea(*qw1, x86::qword_ptr(*addr, 0 - start));
+		c->xor_(qw0->r32(), qw0->r32());
+		c->cmp(qw1->r32(), end - start);
+		c->cmovae(qw1->r32(), qw0->r32());
+		c->cmovb(x86::r10, x86::qword_ptr(x86::r10, *qw1, 1, 0));
+		c->cmovae(x86::r10, x86::qword_ptr(*cpu, addr->r64(), 1, offset32(&SPUThread::jit_dispatcher)));
+	}
 
 	if (op.d)
 	{
@@ -2741,7 +2750,7 @@ void spu_recompiler::BI(spu_opcode_t op)
 {
 	c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3));
 	c->and_(*addr, 0x3fffc);
-	branch_indirect(op);
+	branch_indirect(op, verify(HERE, m_targets[m_pos].size()) > 2);
 	m_pos = -1;
 }
 
diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.h b/rpcs3/Emu/Cell/SPUASMJITRecompiler.h
index ae45e09a13..8cb66c287c 100644
--- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.h
+++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.h
@@ -107,7 +107,7 @@ private:
 	asmjit::X86Mem XmmConst(__m128i data);
 
 	void branch_fixed(u32 target);
-	void branch_indirect(spu_opcode_t op);
+	void branch_indirect(spu_opcode_t op, bool jt = false);
 	void fall(spu_opcode_t op);
 	void save_rcx();
 	void load_rcx();
diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp
index 3b0aea7cdf..3da99e3be2 100644
--- a/rpcs3/Emu/Cell/SPURecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPURecompiler.cpp
@@ -89,7 +89,7 @@ void spu_cache::initialize()
 	}
 
 	// SPU cache file (version + block size type)
-	const std::string loc = _main->cache + u8"spu-§" + fmt::to_lower(g_cfg.core.spu_block_size.to_string()) + "-v0.dat";
+	const std::string loc = _main->cache + u8"spu-§" + fmt::to_lower(g_cfg.core.spu_block_size.to_string()) + "-v1.dat";
 
 	auto cache = std::make_shared<spu_cache>(loc);
 
@@ -272,14 +272,16 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 	result.push_back(lsa);
 
 	// Initialize block entries
-	std::bitset<0x10000>& blocks = m_block_info;
-	blocks.reset();
-	blocks.set(lsa / 4);
+	m_block_info.reset();
+	m_block_info.set(lsa / 4);
 
 	// Simple block entry workload list
 	std::vector<u32> wl;
 	wl.push_back(lsa);
 
+	m_regmod.fill(0xff);
+	m_targets.clear();
+
 	// Value flags (TODO)
 	enum class vf : u32
 	{
@@ -310,9 +312,9 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 		if (target > lsa)
 		{
 			// Check for redundancy
-			if (!blocks[target / 4])
+			if (!m_block_info[target / 4])
 			{
-				blocks[target / 4] = true;
+				m_block_info[target / 4] = true;
 				wl.push_back(target);
 				return;
 			}
@@ -325,6 +327,8 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 
 		wl[wi] += 4;
 
+		m_targets.erase(pos);
+
 		// Analyse instruction
 		switch (const auto type = s_spu_itype.decode(data))
 		{
@@ -336,7 +340,8 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 		case spu_itype::DFTSV:
 		{
 			// Stop before invalid instructions (TODO)
-			blocks[pos / 4] = true;
+			m_targets[pos].push_back(-1);
+			m_block_info[pos / 4] = true;
 			next_block();
 			continue;
 		}
@@ -349,7 +354,8 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 			if (data == 0 || data == 3)
 			{
 				// Stop before null data
-				blocks[pos / 4] = true;
+				m_targets[pos].push_back(-1);
+				m_block_info[pos / 4] = true;
 				next_block();
 				continue;
 			}
@@ -357,6 +363,7 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 			if (g_cfg.core.spu_block_size != spu_block_size_type::giga)
 			{
 				// Stop on special instructions (TODO)
+				m_targets[pos].push_back(-1);
 				next_block();
 				break;
 			}
@@ -366,6 +373,7 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 
 		case spu_itype::IRET:
 		{
+			m_targets[pos].push_back(-1);
 			next_block();
 			break;
 		}
@@ -382,6 +390,7 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 
 			if (type == spu_itype::BISL)
 			{
+				m_regmod[pos / 4] = op.rt;
 				vflags[op.rt] = +vf::is_const;
 				values[op.rt] = pos + 4;
 			}
@@ -389,23 +398,24 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 			if (test(af, vf::is_const))
 			{
 				const u32 target = spu_branch_target(av);
 
+				LOG_WARNING(SPU, "[0x%x] At 0x%x: indirect branch to 0x%x", lsa, pos, target);
 				if (target == pos + 4)
 				{
 					// Nop (unless BISL)
-					break;
+					LOG_WARNING(SPU, "[0x%x] At 0x%x: indirect branch to next!", lsa, pos);
 				}
 
+				m_targets[pos].push_back(target);
+
 				if (type != spu_itype::BISL || g_cfg.core.spu_block_size == spu_block_size_type::giga)
 				{
-					LOG_WARNING(SPU, "[0x%x] At 0x%x: indirect branch to 0x%x", lsa, pos, target);
 					add_block(target);
 				}
 
-				if (type == spu_itype::BISL && target < lsa)
+				if (type == spu_itype::BISL && target >= lsa && g_cfg.core.spu_block_size == spu_block_size_type::giga)
 				{
-					next_block();
-					break;
+					add_block(pos + 4);
 				}
 			}
 			else if (type == spu_itype::BI && !op.d && !op.e)
@@ -488,6 +498,8 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 							add_block(jt_abs[i]);
 							result[(start - lsa) / 4 + 1 + i] = se_storage<u32>::swap(jt_abs[i]);
 						}
+
+						m_targets.emplace(pos, std::move(jt_abs));
 					}
 
 					if (jt_rel.size() >= jt_abs.size())
@@ -504,19 +516,33 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 							add_block(jt_rel[i]);
 							result[(start - lsa) / 4 + 1 + i] = se_storage<u32>::swap(jt_rel[i] - start);
 						}
+
+						m_targets.emplace(pos, std::move(jt_rel));
 					}
 				}
 			}
 
-			if (type == spu_itype::BI || type == spu_itype::BISL || g_cfg.core.spu_block_size == spu_block_size_type::safe)
+			if (type == spu_itype::BI || type == spu_itype::BISL)
 			{
 				if (type == spu_itype::BI || g_cfg.core.spu_block_size != spu_block_size_type::giga)
 				{
-					next_block();
-					break;
+					if (m_targets[pos].empty())
+					{
+						m_targets[pos].push_back(-1);
+					}
+				}
+				else
+				{
+					add_block(pos + 4);
 				}
 			}
+			else
+			{
+				m_targets[pos].push_back(pos + 4);
+				add_block(pos + 4);
+			}
 
+			next_block();
 			break;
 		}
 
@@ -525,6 +551,7 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 		{
 			const u32 target = spu_branch_target(type == spu_itype::BRASL ? 0 : pos, op.i16);
 
+			m_regmod[pos / 4] = op.rt;
 			vflags[op.rt] = +vf::is_const;
 			values[op.rt] = pos + 4;
 
@@ -534,11 +561,11 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 				break;
 			}
 
-			if (target < lsa || g_cfg.core.spu_block_size != spu_block_size_type::giga)
+			m_targets[pos].push_back(target);
+
+			if (target >= lsa && g_cfg.core.spu_block_size == spu_block_size_type::giga)
 			{
-				// Stop on direct calls
-				next_block();
-				break;
+				add_block(pos + 4);
 			}
 
 			if (g_cfg.core.spu_block_size == spu_block_size_type::giga)
@@ -546,6 +573,7 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 				add_block(target);
 			}
 
+			next_block();
 			break;
 		}
 
@@ -564,15 +592,16 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 				break;
 			}
 
+			m_targets[pos].push_back(target);
 			add_block(target);
 
-			if (type == spu_itype::BR || type == spu_itype::BRA)
+			if (type != spu_itype::BR && type != spu_itype::BRA)
 			{
-				// Stop on direct branches
-				next_block();
-				break;
+				m_targets[pos].push_back(pos + 4);
+				add_block(pos + 4);
 			}
 
+			next_block();
 			break;
 		}
 
@@ -601,61 +630,131 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 
 		case spu_itype::IL:
 		{
+			m_regmod[pos / 4] = op.rt;
 			vflags[op.rt] = +vf::is_const;
 			values[op.rt] = op.si16;
 			break;
 		}
 		case spu_itype::ILA:
 		{
+			m_regmod[pos / 4] = op.rt;
 			vflags[op.rt] = +vf::is_const;
 			values[op.rt] = op.i18;
 			break;
 		}
 		case spu_itype::ILH:
 		{
+			m_regmod[pos / 4] = op.rt;
 			vflags[op.rt] = +vf::is_const;
 			values[op.rt] = op.i16 << 16 | op.i16;
 			break;
 		}
 		case spu_itype::ILHU:
 		{
+			m_regmod[pos / 4] = op.rt;
 			vflags[op.rt] = +vf::is_const;
 			values[op.rt] = op.i16 << 16;
 			break;
 		}
 		case spu_itype::IOHL:
 		{
+			m_regmod[pos / 4] = op.rt;
 			values[op.rt] = values[op.rt] | op.i16;
 			break;
 		}
 		case spu_itype::ORI:
 		{
+			m_regmod[pos / 4] = op.rt;
 			vflags[op.rt] = vflags[op.ra] & vf::is_const;
 			values[op.rt] = values[op.ra] | op.si10;
 			break;
 		}
 		case spu_itype::OR:
 		{
+			m_regmod[pos / 4] = op.rt;
 			vflags[op.rt] = vflags[op.ra] & vflags[op.rb] & vf::is_const;
 			values[op.rt] = values[op.ra] | values[op.rb];
 			break;
 		}
+		case spu_itype::ANDI:
+		{
+			m_regmod[pos / 4] = op.rt;
+			vflags[op.rt] = vflags[op.ra] & vf::is_const;
+			values[op.rt] = values[op.ra] & op.si10;
+			break;
+		}
+		case spu_itype::AND:
+		{
+			m_regmod[pos / 4] = op.rt;
+			vflags[op.rt] = vflags[op.ra] & vflags[op.rb] & vf::is_const;
+			values[op.rt] = values[op.ra] & values[op.rb];
+			break;
+		}
 		case spu_itype::AI:
 		{
+			m_regmod[pos / 4] = op.rt;
 			vflags[op.rt] = vflags[op.ra] & vf::is_const;
 			values[op.rt] = values[op.ra] + op.si10;
 			break;
 		}
 		case spu_itype::A:
 		{
+			m_regmod[pos / 4] = op.rt;
 			vflags[op.rt] = vflags[op.ra] & vflags[op.rb] & vf::is_const;
 			values[op.rt] = values[op.ra] + values[op.rb];
 			break;
 		}
+		case spu_itype::SFI:
+		{
+			m_regmod[pos / 4] = op.rt;
+			vflags[op.rt] = vflags[op.ra] & vf::is_const;
+			values[op.rt] = op.si10 - values[op.ra];
+			break;
+		}
+		case spu_itype::SF:
+		{
+			m_regmod[pos / 4] = op.rt;
+			vflags[op.rt] = vflags[op.ra] & vflags[op.rb] & vf::is_const;
+			values[op.rt] = values[op.rb] - values[op.ra];
+			break;
+		}
+		case spu_itype::ROTMI:
+		{
+			m_regmod[pos / 4] = op.rt;
+
+			if (-op.i7 & 0x20)
+			{
+				vflags[op.rt] = +vf::is_const;
+				values[op.rt] = 0;
+				break;
+			}
+
+			vflags[op.rt] = vflags[op.ra] & vf::is_const;
+			values[op.rt] = values[op.ra] >> (-op.i7 & 0x1f);
+			break;
+		}
+		case spu_itype::SHLI:
+		{
+			m_regmod[pos / 4] = op.rt;
+
+			if (op.i7 & 0x20)
+			{
+				vflags[op.rt] = +vf::is_const;
+				values[op.rt] = 0;
+				break;
+			}
+
+			vflags[op.rt] = vflags[op.ra] & vf::is_const;
+			values[op.rt] = values[op.ra] << (op.i7 & 0x1f);
+			break;
+		}
+
 		default:
 		{
 			// Unconst
-			vflags[type & spu_itype::_quadrop ? +op.rt4 : +op.rt] = {};
+			const u32 op_rt = type & spu_itype::_quadrop ? +op.rt4 : +op.rt;
+			m_regmod[pos / 4] = op_rt;
+			vflags[op_rt] = {};
 			break;
 		}
 		}
@@ -783,7 +882,6 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
 	llvm::Value* m_lsptr;
 
 	llvm::BasicBlock* m_stop;
-	llvm::GlobalVariable* m_jt;
 
 	std::array, 128> m_gpr;
 	std::array m_flush_gpr;
@@ -1047,27 +1145,15 @@ public:
 
 		m_stop = BasicBlock::Create(m_context, "", m_function);
 
-		const auto jtt = ArrayType::get(GetType(), m_size / 4);
-		std::vector jt;
-		jt.reserve(m_size / 4);
-
 		// Create instruction blocks
 		for (u32 i = 1, pos = start; i < func.size(); i++, pos += 4)
 		{
 			if (func[i] && m_block_info[pos / 4])
 			{
-				const auto b = BasicBlock::Create(m_context, "", m_function);
-				jt.push_back(llvm::BlockAddress::get(b));
-				m_instr_map.emplace(pos, b);
-			}
-			else
-			{
-				jt.push_back(llvm::BlockAddress::get(m_stop));
+				m_instr_map.emplace(pos, BasicBlock::Create(m_context, "", m_function));
 			}
 		}
 
-		m_jt = new GlobalVariable(*module, jtt, true, GlobalValue::PrivateLinkage, llvm::ConstantArray::get(jtt, jt), "jt");
-
 		update_pc();
 
 		const auto label_test = BasicBlock::Create(m_context, "", m_function);
@@ -2764,24 +2850,43 @@ public:
 			addr.value = call(&exec_check_interrupts, m_thread, addr.value);
 		}
 
-		if (llvm::isa<llvm::ConstantInt>(addr.value))
+		if (const auto _int = llvm::dyn_cast<llvm::ConstantInt>(addr.value))
 		{
-			return branch_fixed(llvm::cast<llvm::ConstantInt>(addr.value)->getZExtValue());
+			LOG_WARNING(SPU, "[0x%x] Fixed branch to 0x%x", m_pos, _int->getZExtValue());
+			return branch_fixed(_int->getZExtValue());
 		}
 
 		m_ir->CreateStore(addr.value, spu_ptr(&SPUThread::pc));
 
-		const u32 start = m_instr_map.begin()->first;
-		const auto local = llvm::BasicBlock::Create(m_context, "", m_function);
-		const auto exter = llvm::BasicBlock::Create(m_context, "", m_function);
-		const auto off = m_ir->CreateSub(addr.value, m_ir->getInt32(start));
-		m_ir->CreateCondBr(m_ir->CreateICmpULT(off, m_ir->getInt32(m_size)), local, exter);
-		m_ir->SetInsertPoint(local);
-		const auto table = m_ir->CreateIndirectBr(m_ir->CreateLoad(m_ir->CreateGEP(m_jt, {(llvm::Value*)m_ir->getInt32(0), m_ir->CreateLShr(off, 2)})), m_instr_map.size() + 1);
-		for (const auto& pair : m_instr_map)
-			table->addDestination(pair.second);
-		table->addDestination(m_stop);
-		m_ir->SetInsertPoint(exter);
+		const auto tfound = m_targets.find(m_pos);
+
+		if (tfound != m_targets.end() && tfound->second.size() >= 3)
+		{
+			const u32 start = m_instr_map.begin()->first;
+
+			const std::set<u32> targets(tfound->second.begin(), tfound->second.end());
+
+			const auto exter = llvm::BasicBlock::Create(m_context, "", m_function);
+
+			const auto sw = m_ir->CreateSwitch(m_ir->CreateLShr(addr.value, 2, "", true), exter, m_size / 4);
+
+			for (u32 pos = start; pos < start + m_size; pos += 4)
+			{
+				const auto found = m_instr_map.find(pos);
+
+				if (found != m_instr_map.end() && targets.count(pos))
+				{
+					sw->addCase(m_ir->getInt32(pos / 4), found->second);
+				}
+				else
+				{
+					sw->addCase(m_ir->getInt32(pos / 4), m_stop);
+				}
+			}
+
+			m_ir->SetInsertPoint(exter);
+		}
+
 		const auto disp = m_ir->CreateAdd(m_thread, m_ir->getInt64(::offset32(&SPUThread::jit_dispatcher)));
 		const auto type = llvm::FunctionType::get(get_type(), {get_type(), get_type(), get_type()}, false)->getPointerTo()->getPointerTo();
 		tail(m_ir->CreateLoad(m_ir->CreateIntToPtr(m_ir->CreateAdd(disp, zext(addr << 1).value), type)));
diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h
index d8b40df664..cc8001485b 100644
--- a/rpcs3/Emu/Cell/SPURecompiler.h
+++ b/rpcs3/Emu/Cell/SPURecompiler.h
@@ -5,6 +5,7 @@
 #include 
 #include 
 #include 
+#include <unordered_map>
 
 // Helper class
 class spu_cache
@@ -35,8 +36,15 @@ protected:
 	u32 m_pos;
 	u32 m_size;
 
+	// Bit indicating start of the block
 	std::bitset<0x10000> m_block_info;
 
+	// GPR modified by the instruction (-1 = not set)
+	std::array<u8, 0x10000> m_regmod;
+
+	// List of possible targets for the instruction ({} = next instruction, {-1} = no targets)
+	std::unordered_map<u32, std::basic_string<u32>, value_hash<u32, 2>> m_targets;
+
 	std::shared_ptr<spu_cache> m_cache;
 
 public: