mirror of
https://github.com/RPCS3/rpcs3.git
synced 2024-11-22 02:32:36 +01:00
SPU analyser: basic function detection in Giga mode
Misc: fix EH frame registration (LLVM, non-Windows). Misc: constant-folding bitcast (cpu_translator). Misc: add syntax for LLVM arrays (cpu_translator). Misc: use function names for proper linkage (SPU LLVM). Changed function search and verification in Giga mode. Basic stack frame layout analysis. Function detection in Giga mode. Basic use of new information in SPU LLVM. Fixed jump table compilation in SPU LLVM. Disable broken optimization in Accurate xfloat mode. Make compiled SPU modules position-independent in SPU LLVM. Optimizations include, but are not limited to: * Compiling SPU functions as native functions when eligible * Avoiding register context write-out * Aligned stack assumption (CWD-like instructions)
This commit is contained in:
parent
fce9d6a7b8
commit
7492f335e9
@ -474,7 +474,7 @@ struct MemoryManager : llvm::RTDyldMemoryManager
|
||||
s_unfire.push_front(std::make_pair(addr, size));
|
||||
#endif
|
||||
|
||||
return RTDyldMemoryManager::registerEHFrames(addr, load_addr, size);
|
||||
return RTDyldMemoryManager::registerEHFramesInProcess(addr, size);
|
||||
}
|
||||
|
||||
void deregisterEHFrames() override
|
||||
@ -508,6 +508,10 @@ struct MemoryManager2 : llvm::RTDyldMemoryManager
|
||||
|
||||
// Register the EH frame range [addr, addr + size) for JIT-ed code.
// The body is compiled out when _WIN32 is defined; load_addr is not used here.
void registerEHFrames(u8* addr, u64 load_addr, std::size_t size) override
{
#if !defined(_WIN32)
	// Register with the in-process unwinder first...
	RTDyldMemoryManager::registerEHFramesInProcess(addr, size);

	// ...then record the same range at the front of s_unfire
	const auto frame_range = std::make_pair(addr, size);
	s_unfire.push_front(frame_range);
#endif
}
|
||||
|
||||
void deregisterEHFrames() override
|
||||
@ -770,25 +774,6 @@ jit_compiler::~jit_compiler()
|
||||
{
|
||||
}
|
||||
|
||||
bool jit_compiler::has_ssse3() const
|
||||
{
|
||||
if (m_cpu == "generic" ||
|
||||
m_cpu == "k8" ||
|
||||
m_cpu == "opteron" ||
|
||||
m_cpu == "athlon64" ||
|
||||
m_cpu == "athlon-fx" ||
|
||||
m_cpu == "k8-sse3" ||
|
||||
m_cpu == "opteron-sse3" ||
|
||||
m_cpu == "athlon64-sse3" ||
|
||||
m_cpu == "amdfam10" ||
|
||||
m_cpu == "barcelona")
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void jit_compiler::add(std::unique_ptr<llvm::Module> module, const std::string& path)
|
||||
{
|
||||
ObjectCache cache{path};
|
||||
|
@ -142,9 +142,6 @@ public:
|
||||
return *m_engine;
|
||||
}
|
||||
|
||||
// Test SSSE3 feature
|
||||
bool has_ssse3() const;
|
||||
|
||||
// Add module (path to obj cache dir)
|
||||
void add(std::unique_ptr<llvm::Module> module, const std::string& path);
|
||||
|
||||
|
@ -9,7 +9,54 @@ cpu_translator::cpu_translator(llvm::Module* module, bool is_be)
|
||||
, m_module(module)
|
||||
, m_is_be(is_be)
|
||||
{
|
||||
}
|
||||
|
||||
// Bind the LLVM context and execution engine, then decide whether SSSE3 may be used
void cpu_translator::initialize(llvm::LLVMContext& context, llvm::ExecutionEngine& engine)
{
	m_context = context;
	m_engine = &engine;

	// Target CPU names for which SSSE3 is switched off
	static constexpr const char* s_no_ssse3_cpus[] =
	{
		"generic",
		"k8",
		"opteron",
		"athlon64",
		"athlon-fx",
		"k8-sse3",
		"opteron-sse3",
		"athlon64-sse3",
		"amdfam10",
		"barcelona",
	};

	const auto target_cpu = m_engine->getTargetMachine()->getTargetCPU();

	// Test SSSE3 feature (TODO): currently a name comparison only
	m_use_ssse3 = true;

	for (const char* cpu_name : s_no_ssse3_cpus)
	{
		if (target_cpu == cpu_name)
		{
			m_use_ssse3 = false;
			break;
		}
	}
}
|
||||
|
||||
// Bitcast val to type. Throws if the total bit widths differ.
// Constants are folded immediately; other values get a bitcast instruction from m_ir.
llvm::Value* cpu_translator::bitcast(llvm::Value* val, llvm::Type* type)
{
	// Total width in bits: scalar width, multiplied by the element count for vector types
	const auto total_bits = [](llvm::Type* t) -> uint
	{
		uint bits = t->getScalarSizeInBits();

		if (t->isVectorTy())
		{
			bits *= t->getVectorNumElements();
		}

		return bits;
	};

	uint dst_bits = total_bits(type);
	uint src_bits = total_bits(val->getType());

	if (dst_bits != src_bits)
	{
		fmt::throw_exception("cpu_translator::bitcast(): incompatible type sizes (%u vs %u)", dst_bits, src_bits);
	}

	// Constant operand: fold the cast now; the folded result is passed through verify()
	if (const auto constant = llvm::dyn_cast<llvm::Constant>(val))
	{
		return verify(HERE, llvm::ConstantFoldCastOperand(llvm::Instruction::BitCast, constant, type, m_module->getDataLayout()));
	}

	return m_ir->CreateBitCast(val, type);
}
|
||||
|
||||
template <>
|
||||
|
@ -9,6 +9,7 @@
|
||||
#include "llvm/IR/LLVMContext.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/Target/TargetMachine.h"
|
||||
#include "llvm/Analysis/ConstantFolding.h"
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning(pop)
|
||||
@ -19,6 +20,8 @@
|
||||
#include "../Utilities/StrFmt.h"
|
||||
#include "../Utilities/BEType.h"
|
||||
#include "../Utilities/BitField.h"
|
||||
#include "../Utilities/Log.h"
|
||||
#include "../Utilities/JIT.h"
|
||||
|
||||
#include <unordered_map>
|
||||
#include <map>
|
||||
@ -47,6 +50,7 @@ struct llvm_value_t
|
||||
static constexpr bool is_sint = false;
|
||||
static constexpr bool is_uint = false;
|
||||
static constexpr bool is_float = false;
|
||||
static constexpr uint is_array = false;
|
||||
static constexpr uint is_vector = false;
|
||||
static constexpr uint is_pointer = false;
|
||||
|
||||
@ -314,6 +318,7 @@ struct llvm_value_t<T*> : llvm_value_t<T>
|
||||
static constexpr bool is_sint = false;
|
||||
static constexpr bool is_uint = false;
|
||||
static constexpr bool is_float = false;
|
||||
static constexpr uint is_array = false;
|
||||
static constexpr uint is_vector = false;
|
||||
static constexpr uint is_pointer = llvm_value_t<T>::is_pointer + 1;
|
||||
|
||||
@ -333,6 +338,7 @@ struct llvm_value_t<T[N]> : llvm_value_t<T>
|
||||
using base = llvm_value_t<T>;
|
||||
using base::base;
|
||||
|
||||
static constexpr uint is_array = 0;
|
||||
static constexpr uint is_vector = N;
|
||||
static constexpr uint is_pointer = 0;
|
||||
|
||||
@ -342,6 +348,48 @@ struct llvm_value_t<T[N]> : llvm_value_t<T>
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, uint N>
|
||||
struct llvm_value_t<T[0][N]> : llvm_value_t<T>
|
||||
{
|
||||
using type = T[0][N];
|
||||
using base = llvm_value_t<T>;
|
||||
using base::base;
|
||||
|
||||
static constexpr bool is_int = false;
|
||||
static constexpr bool is_sint = false;
|
||||
static constexpr bool is_uint = false;
|
||||
static constexpr bool is_float = false;
|
||||
static constexpr uint is_array = N;
|
||||
static constexpr uint is_vector = false;
|
||||
static constexpr uint is_pointer = false;
|
||||
|
||||
static llvm::Type* get_type(llvm::LLVMContext& context)
|
||||
{
|
||||
return llvm::ArrayType::get(llvm_value_t<T>::get_type(context), N);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, uint V, uint N>
|
||||
struct llvm_value_t<T[V][N]> : llvm_value_t<T[V]>
|
||||
{
|
||||
using type = T[V][N];
|
||||
using base = llvm_value_t<T[V]>;
|
||||
using base::base;
|
||||
|
||||
static constexpr bool is_int = false;
|
||||
static constexpr bool is_sint = false;
|
||||
static constexpr bool is_uint = false;
|
||||
static constexpr bool is_float = false;
|
||||
static constexpr uint is_array = N;
|
||||
static constexpr uint is_vector = false;
|
||||
static constexpr uint is_pointer = false;
|
||||
|
||||
static llvm::Type* get_type(llvm::LLVMContext& context)
|
||||
{
|
||||
return llvm::ArrayType::get(llvm_value_t<T[V]>::get_type(context), N);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
using llvm_expr_t = std::decay_t<T>;
|
||||
|
||||
@ -2368,6 +2416,9 @@ protected:
|
||||
// Module to which all generated code is output to
|
||||
llvm::Module* m_module;
|
||||
|
||||
// Execution engine from JIT instance
|
||||
llvm::ExecutionEngine* m_engine{};
|
||||
|
||||
// Endianness, affects vector element numbering (TODO)
|
||||
bool m_is_be;
|
||||
|
||||
@ -2377,6 +2428,8 @@ protected:
|
||||
// IR builder
|
||||
llvm::IRBuilder<>* m_ir;
|
||||
|
||||
void initialize(llvm::LLVMContext& context, llvm::ExecutionEngine& engine);
|
||||
|
||||
public:
|
||||
// Convert a C++ type to an LLVM type (TODO: remove)
|
||||
template <typename T>
|
||||
@ -2421,6 +2474,26 @@ public:
|
||||
return result;
|
||||
}
|
||||
|
||||
// Call external function: provide name and function pointer
|
||||
template <typename RT, typename... FArgs, typename... Args>
|
||||
llvm::CallInst* call(std::string_view lame, RT(*_func)(FArgs...), Args... args)
|
||||
{
|
||||
static_assert(sizeof...(FArgs) == sizeof...(Args), "spu_llvm_recompiler::call(): unexpected arg number");
|
||||
const auto type = llvm::FunctionType::get(get_type<RT>(), {args->getType()...}, false);
|
||||
const auto func = llvm::cast<llvm::Function>(m_module->getOrInsertFunction({lame.data(), lame.size()}, type).getCallee());
|
||||
m_engine->addGlobalMapping({lame.data(), lame.size()}, reinterpret_cast<std::uintptr_t>(_func));
|
||||
return m_ir->CreateCall(func, {args...});
|
||||
}
|
||||
|
||||
// Bitcast with immediate constant folding
|
||||
llvm::Value* bitcast(llvm::Value* val, llvm::Type* type);
|
||||
|
||||
template <typename T>
|
||||
llvm::Value* bitcast(llvm::Value* val)
|
||||
{
|
||||
return bitcast(val, get_type<T>());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static llvm_placeholder_t<T> match()
|
||||
{
|
||||
|
@ -4677,7 +4677,7 @@ bool ppu_interpreter::MTFSB0(ppu_thread& ppu, ppu_opcode_t op)
|
||||
bool ppu_interpreter::MTFSFI(ppu_thread& ppu, ppu_opcode_t op)
|
||||
{
|
||||
const u32 bf = op.crfd * 4;
|
||||
if (bf != 4 * 4)
|
||||
if (bf != 4 * 4)
|
||||
{
|
||||
// Do nothing on non-FPCC field (TODO)
|
||||
LOG_WARNING(PPU, "MTFSFI(%d)", op.crfd);
|
||||
|
@ -1711,7 +1711,7 @@ static void ppu_initialize2(jit_compiler& jit, const ppu_module& module_part, co
|
||||
module->setDataLayout(jit.get_engine().getTargetMachine()->createDataLayout());
|
||||
|
||||
// Initialize translator
|
||||
PPUTranslator translator(jit.get_context(), module.get(), module_part, jit.has_ssse3());
|
||||
PPUTranslator translator(jit.get_context(), module.get(), module_part, jit.get_engine());
|
||||
|
||||
// Define some types
|
||||
const auto _void = Type::getVoidTy(jit.get_context());
|
||||
|
@ -79,7 +79,7 @@ public:
|
||||
result |= bit;
|
||||
}
|
||||
|
||||
return result;
|
||||
return result;
|
||||
}
|
||||
|
||||
// Unpack CR bits
|
||||
|
@ -11,14 +11,13 @@ using namespace llvm;
|
||||
|
||||
const ppu_decoder<PPUTranslator> s_ppu_decoder;
|
||||
|
||||
PPUTranslator::PPUTranslator(LLVMContext& context, Module* module, const ppu_module& info, bool ssse3)
|
||||
PPUTranslator::PPUTranslator(LLVMContext& context, Module* module, const ppu_module& info, ExecutionEngine& engine)
|
||||
: cpu_translator(module, false)
|
||||
, m_info(info)
|
||||
, m_pure_attr(AttributeList::get(m_context, AttributeList::FunctionIndex, {Attribute::NoUnwind, Attribute::ReadNone}))
|
||||
{
|
||||
// Bind context
|
||||
m_context = context;
|
||||
m_use_ssse3 = ssse3;
|
||||
cpu_translator::initialize(context, engine);
|
||||
|
||||
// There is no weak linkage on JIT, so let's create variables with different names for each module part
|
||||
const u32 gsuffix = m_info.name.empty() ? info.funcs[0].addr : info.funcs[0].addr - m_info.segs[0].addr;
|
||||
|
@ -315,7 +315,7 @@ public:
|
||||
// Handle compilation errors
|
||||
void CompilationError(const std::string& error);
|
||||
|
||||
PPUTranslator(llvm::LLVMContext& context, llvm::Module* module, const ppu_module& info, bool ssse3);
|
||||
PPUTranslator(llvm::LLVMContext& context, llvm::Module* module, const ppu_module& info, llvm::ExecutionEngine& engine);
|
||||
~PPUTranslator();
|
||||
|
||||
// Get thread context struct type
|
||||
|
@ -260,7 +260,7 @@ bool spu_thread::write_reg(const u32 addr, const u32 value)
|
||||
|
||||
void spu_load_exec(const spu_exec_object& elf)
|
||||
{
|
||||
auto ls0 = vm::cast(vm::falloc(RAW_SPU_BASE_ADDR, 0x40000, vm::spu));
|
||||
auto ls0 = vm::cast(vm::falloc(RAW_SPU_BASE_ADDR, 0x80000, vm::spu));
|
||||
auto spu = idm::make_ptr<named_thread<spu_thread>>("TEST_SPU", ls0, nullptr, 0, "");
|
||||
|
||||
spu_thread::g_raw_spu_ctr++;
|
||||
|
@ -11,6 +11,7 @@ struct spu_itype
|
||||
static constexpr struct branch_tag{} branch{}; // Branch Instructions
|
||||
static constexpr struct floating_tag{} floating{}; // Floating-Point Instructions
|
||||
static constexpr struct quadrop_tag{} _quadrop{}; // 4-op Instructions
|
||||
static constexpr struct xfloat_tag{} xfloat{}; // Instructions producing xfloat values
|
||||
|
||||
enum type : unsigned char
|
||||
{
|
||||
@ -146,24 +147,26 @@ struct spu_itype
|
||||
FMS, // quadrop_tag last
|
||||
|
||||
FA,
|
||||
DFA,
|
||||
FS,
|
||||
DFS,
|
||||
FM,
|
||||
FREST,
|
||||
FRSQEST,
|
||||
FI,
|
||||
CSFLT,
|
||||
CUFLT,
|
||||
FRDS, // xfloat_tag last
|
||||
|
||||
DFA,
|
||||
DFS,
|
||||
DFM,
|
||||
DFMA,
|
||||
DFNMS,
|
||||
DFMS,
|
||||
DFNMA,
|
||||
FREST,
|
||||
FRSQEST,
|
||||
FI,
|
||||
CSFLT,
|
||||
CFLTS,
|
||||
CUFLT,
|
||||
CFLTU,
|
||||
FRDS,
|
||||
FESD,
|
||||
|
||||
CFLTS,
|
||||
CFLTU,
|
||||
FCEQ,
|
||||
FCMEQ,
|
||||
FCGT,
|
||||
@ -252,6 +255,12 @@ struct spu_itype
|
||||
{
|
||||
return value >= MPYA && value <= FMS;
|
||||
}
|
||||
|
||||
// Test for xfloat instruction
|
||||
friend constexpr bool operator &(type value, xfloat_tag)
|
||||
{
|
||||
return value >= FMA && value <= FRDS;
|
||||
}
|
||||
};
|
||||
|
||||
struct spu_iflag
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -44,8 +44,14 @@ class spu_runtime
|
||||
|
||||
atomic_t<u64> m_reset_count{0};
|
||||
|
||||
// Comparator functor type for containers keyed by std::vector<u32>
struct func_compare
|
||||
{
|
||||
// Comparison function for SPU programs
// (declared only; the definition is not part of this view)
|
||||
bool operator()(const std::vector<u32>& lhs, const std::vector<u32>& rhs) const;
|
||||
};
|
||||
|
||||
// All functions
|
||||
std::map<std::vector<u32>, spu_function_t> m_map;
|
||||
std::map<std::vector<u32>, spu_function_t, func_compare> m_map;
|
||||
|
||||
// Debug module output location
|
||||
std::string m_cache_path;
|
||||
@ -57,8 +63,8 @@ class spu_runtime
|
||||
u16 from;
|
||||
u16 level;
|
||||
u8* rel32;
|
||||
std::map<std::vector<u32>, spu_function_t>::iterator beg;
|
||||
std::map<std::vector<u32>, spu_function_t>::iterator end;
|
||||
decltype(m_map)::iterator beg;
|
||||
decltype(m_map)::iterator end;
|
||||
};
|
||||
|
||||
// Scratch vector
|
||||
@ -199,6 +205,17 @@ public:
|
||||
s_reg_max
|
||||
};
|
||||
|
||||
// Classify terminator instructions
// Scoped enumeration stored in an unsigned char; enumerators take consecutive values starting at 0
|
||||
enum class term_type : unsigned char
|
||||
{
|
||||
br,
|
||||
ret,
|
||||
call,
|
||||
fallthrough,
|
||||
indirect_call,
|
||||
interrupt_call,
|
||||
};
|
||||
|
||||
protected:
|
||||
std::shared_ptr<spu_runtime> m_spurt;
|
||||
|
||||
@ -239,12 +256,39 @@ protected:
|
||||
// Internal use flag
|
||||
bool analysed = false;
|
||||
|
||||
// Terminator instruction type
|
||||
term_type terminator;
|
||||
|
||||
// Bit mask of the registers modified in the block
|
||||
std::bitset<s_reg_max> reg_mod{};
|
||||
|
||||
// Set if last modifying instruction produces xfloat
|
||||
std::bitset<s_reg_max> reg_mod_xf{};
|
||||
|
||||
// Set if the initial register value in this block may be xfloat
|
||||
std::bitset<s_reg_max> reg_maybe_xf{};
|
||||
|
||||
// Bit mask of the registers used (before modified)
|
||||
std::bitset<s_reg_max> reg_use{};
|
||||
|
||||
// Bit mask of the trivial (u32 x 4) constant value resulting in this block
|
||||
std::bitset<s_reg_max> reg_const{};
|
||||
|
||||
// Bit mask of register saved onto the stack before use
|
||||
std::bitset<s_reg_max> reg_save_dom{};
|
||||
|
||||
// Address of the function
|
||||
u32 func = 0x40000;
|
||||
|
||||
// Value subtracted from $SP in this block, negative if something funny is done on $SP
|
||||
u32 stack_sub = 0;
|
||||
|
||||
// Constant values associated with reg_const
|
||||
std::array<u32, s_reg_max> reg_val32;
|
||||
|
||||
// Registers loaded from the stack in this block (stack offset)
|
||||
std::array<u32, s_reg_max> reg_load_mod{};
|
||||
|
||||
// Single source of the reg value (dominating block address within the same chunk) or a negative number
|
||||
std::array<u32, s_reg_max> reg_origin, reg_origin_abs;
|
||||
|
||||
@ -258,13 +302,27 @@ protected:
|
||||
// Sorted basic block info
|
||||
std::map<u32, block_info> m_bbs;
|
||||
|
||||
// Advanced block (chunk) information
|
||||
struct chunk_info
|
||||
// Sorted advanced block (chunk) list
|
||||
std::basic_string<u32> m_chunks;
|
||||
|
||||
// Function information
// Plain data record; every member has a default initializer
|
||||
struct func_info
|
||||
{
|
||||
// Size to the end of last basic block
// (defaults to 0)
|
||||
u16 size = 0;
|
||||
|
||||
// Determines whether a function is eligible for optimizations
// (defaults to false)
|
||||
bool good = false;
|
||||
|
||||
// Call targets
// (stored as a sequence of u32 values; starts empty)
|
||||
std::basic_string<u32> calls;
|
||||
|
||||
// Register save info (stack offset)
// (one u32 per register index up to s_reg_max; value-initialized to zero)
|
||||
std::array<u32, s_reg_max> reg_save_off{};
|
||||
};
|
||||
|
||||
// Sorted chunk info
|
||||
std::map<u32, chunk_info> m_chunks;
|
||||
// Sorted function info
|
||||
std::map<u32, func_info> m_funcs;
|
||||
|
||||
std::shared_ptr<spu_cache> m_cache;
|
||||
|
||||
@ -272,6 +330,9 @@ private:
|
||||
// For private use
|
||||
std::bitset<0x10000> m_bits;
|
||||
|
||||
// For private use
|
||||
std::vector<u32> workload;
|
||||
|
||||
// Result of analyse(), to avoid copying and allocation
|
||||
std::vector<u32> result;
|
||||
|
||||
|
@ -579,6 +579,10 @@ public:
|
||||
u64 block_recover = 0;
|
||||
u64 block_failure = 0;
|
||||
|
||||
u64 saved_native_sp = 0; // Host thread's stack pointer for emulated longjmp
|
||||
|
||||
u8* memory_base_addr = vm::g_base_addr;
|
||||
|
||||
std::array<v128, 0x4000> stack_mirror; // Return address information
|
||||
|
||||
void push_snr(u32 number, u32 value);
|
||||
|
@ -232,7 +232,7 @@ error_code sys_spu_thread_initialize(vm::ptr<u32> thread, u32 group_id, u32 spu_
|
||||
sys_spu.todo("Unimplemented SPU Thread options (0x%x)", option);
|
||||
}
|
||||
|
||||
const vm::addr_t ls_addr{verify("SPU LS" HERE, vm::alloc(0x40000, vm::main))};
|
||||
const vm::addr_t ls_addr{verify("SPU LS" HERE, vm::alloc(0x80000, vm::main))};
|
||||
|
||||
const u32 tid = idm::import<named_thread<spu_thread>>([&]()
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user