diff --git a/3rdparty/MoltenVK/CMakeLists.txt b/3rdparty/MoltenVK/CMakeLists.txt
index 5fb379c642..6b16c23a93 100644
--- a/3rdparty/MoltenVK/CMakeLists.txt
+++ b/3rdparty/MoltenVK/CMakeLists.txt
@@ -7,9 +7,8 @@ ExternalProject_Add(moltenvk
 	GIT_TAG 1236d2f
 	BUILD_IN_SOURCE 1
 	SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/MoltenVK
-	PATCH_COMMAND git apply "${CMAKE_CURRENT_SOURCE_DIR}/patches.patch"
 	CONFIGURE_COMMAND "${CMAKE_CURRENT_SOURCE_DIR}/MoltenVK/fetchDependencies" --macos
-	BUILD_COMMAND xcodebuild build -quiet -project "${CMAKE_CURRENT_SOURCE_DIR}/MoltenVK/MoltenVKPackaging.xcodeproj" -scheme "MoltenVK Package \(macOS only\)" -configuration "Release" -arch "x86_64"
+	BUILD_COMMAND xcodebuild build -quiet -project "${CMAKE_CURRENT_SOURCE_DIR}/MoltenVK/MoltenVKPackaging.xcodeproj" -scheme "MoltenVK Package \(macOS only\)" -configuration "Release" -arch "${CMAKE_HOST_SYSTEM_PROCESSOR}"
 	COMMAND ln -f "${CMAKE_CURRENT_SOURCE_DIR}/MoltenVK/MoltenVK/dylib/macOS/libMoltenVK.dylib" "${CMAKE_CURRENT_SOURCE_DIR}/MoltenVK/Build/Products/Release/dynamic/libMoltenVK.dylib"
 	INSTALL_COMMAND ""
 	BUILD_BYPRODUCTS "${CMAKE_CURRENT_SOURCE_DIR}/MoltenVK/Build/Products/Release/dynamic/libMoltenVK.dylib"
diff --git a/Utilities/JIT.cpp b/Utilities/JIT.cpp
index aa5b82aa02..6b1de49eee 100644
--- a/Utilities/JIT.cpp
+++ b/Utilities/JIT.cpp
@@ -196,7 +196,11 @@ void* jit_runtime_base::_add(asmjit::CodeHolder* code) noexcept
 	ensure(!code->relocateToBase(uptr(p)));
 
 	{
+		// We manage rw <-> rx transitions manually on Apple
+		// because it's easier to keep track of when and where we need to toggle W^X
+#if !(defined(ARCH_ARM64) && defined(__APPLE__))
 		asmjit::VirtMem::ProtectJitReadWriteScope rwScope(p, codeSize);
+#endif
 
 		for (asmjit::Section* section : code->_sections)
 		{
@@ -248,6 +252,9 @@ void jit_runtime::initialize()
 
 void jit_runtime::finalize() noexcept
 {
+#ifdef __APPLE__
+	pthread_jit_write_protect_np(false);
+#endif
 	// Reset JIT memory
 #ifdef CAN_OVERCOMMIT
 	utils::memory_reset(get_jit_memory(), 0x80000000);
@@ -262,6 +269,15 @@ void jit_runtime::finalize() noexcept
 	// Restore code/data snapshot
 	std::memcpy(alloc(s_code_init.size(), 1, true), s_code_init.data(), s_code_init.size());
 	std::memcpy(alloc(s_data_init.size(), 1, false), s_data_init.data(), s_data_init.size());
+
+#ifdef __APPLE__
+	pthread_jit_write_protect_np(true);
+#endif
+#ifdef ARCH_ARM64
+	// Flush all cache lines after potentially writing executable code
+	asm("ISB");
+	asm("DSB ISH");
+#endif
 }
 
 jit_runtime_base& asmjit::get_global_runtime()
@@ -432,6 +448,21 @@ static u64 make_null_function(const std::string& name)
 			c.db(ch);
 		c.db(0);
 		c.align(AlignMode::kData, 16);
+#else
+		// AArch64 implementation
+		Label jmp_address = c.newLabel();
+		Label data = c.newLabel();
+		// Force absolute jump to prevent out of bounds PC-rel jmp
+		c.ldr(args[0], arm::ptr(jmp_address));
+		c.br(args[0]);
+		c.align(AlignMode::kCode, 16);
+
+		c.bind(data);
+		c.embed(name.c_str(), name.size());
+		c.embedUInt8(0U);
+		c.bind(jmp_address);
+		c.embedUInt64(reinterpret_cast<u64>(&null));
+		c.align(AlignMode::kData, 16);
 #endif
 	});
 
@@ -840,6 +871,7 @@ jit_compiler::jit_compiler(const std::unordered_map<std::string, u64>& _link, co
 	std::string result;
 
 	auto null_mod = std::make_unique<llvm::Module> ("null_", *m_context);
+	null_mod->setTargetTriple(utils::c_llvm_default_triple);
 
 	if (_link.empty())
 	{
@@ -852,7 +884,7 @@ jit_compiler::jit_compiler(const std::unordered_map<std::string, u64>& _link, co
 	else
 	{
 		mem = std::make_unique();
-		null_mod->setTargetTriple(llvm::Triple::normalize("x86_64-unknown-linux-gnu"));
+		null_mod->setTargetTriple(utils::c_llvm_default_triple);
 	}
 
 	// Auxiliary JIT (does not use custom memory manager, only writes the objects)
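Note on the JIT.cpp hunks above: on Apple Silicon, MAP_JIT pages are either writable or executable per thread, toggled with `pthread_jit_write_protect_np()`; this patch flips the flag manually at each site. The same discipline could be phrased as a scope guard. A minimal sketch, assuming the same barrier pair the patch emits after code writes (illustrative only, not part of the patch):

```cpp
#include <pthread.h> // pthread_jit_write_protect_np (macOS 11+, Apple Silicon)

// Hypothetical RAII helper: lower W^X for this thread, then restore it and
// resynchronize the pipeline on scope exit, mirroring the manual toggles above.
struct jit_write_scope
{
	jit_write_scope() { pthread_jit_write_protect_np(false); }  // pages RW
	~jit_write_scope()
	{
		pthread_jit_write_protect_np(true);                     // pages RX
		asm volatile("ISB");     // same barrier pair the patch uses
		asm volatile("DSB ISH"); // after writing executable code
	}
};
```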
diff --git a/Utilities/JIT.h b/Utilities/JIT.h
index 4c70785f2e..33a7317091 100644
--- a/Utilities/JIT.h
+++ b/Utilities/JIT.h
@@ -269,6 +269,9 @@ namespace asmjit
 template <typename FT, typename F>
 inline FT build_function_asm(std::string_view name, F&& builder)
 {
+#ifdef __APPLE__
+	pthread_jit_write_protect_np(false);
+#endif
 	using namespace asmjit;
 
 	auto& rt = get_global_runtime();
diff --git a/llvm b/llvm
index c725f494c9..5521155be5 160000
--- a/llvm
+++ b/llvm
@@ -1 +1 @@
-Subproject commit c725f494c91611018f5d830eca22c0a1662c0f31
+Subproject commit 5521155be5c869b0b760e1dec86c41cdbb7a75c0
diff --git a/rpcs3/CMakeLists.txt b/rpcs3/CMakeLists.txt
index de1dfb7011..ff02cc95c3 100644
--- a/rpcs3/CMakeLists.txt
+++ b/rpcs3/CMakeLists.txt
@@ -138,6 +138,11 @@ find_program(MACDEPLOYQT_EXECUTABLE macdeployqt HINTS "${_qt_bin_dir}")
 
 # Copy icons to executable directory
 if(APPLE)
+	if (CMAKE_BUILD_TYPE MATCHES "Debug" OR CMAKE_BUILD_TYPE MATCHES "RelWithDebInfo")
+		set(QT_DEPLOY_FLAGS "-no-strip")
+	else()
+		set(QT_DEPLOY_FLAGS "")
+	endif()
 	add_custom_command(TARGET rpcs3 POST_BUILD
 		COMMAND ${CMAKE_COMMAND} -E copy ${RPCS3_SRC_DIR}/rpcs3.icns $<TARGET_FILE_DIR:rpcs3>/../Resources/rpcs3.icns
@@ -147,7 +152,7 @@ if(APPLE)
 			${CMAKE_SOURCE_DIR}/bin/GuiConfigs $<TARGET_FILE_DIR:rpcs3>/../Resources/GuiConfigs
 		COMMAND ${CMAKE_COMMAND} -E copy_directory
 			${CMAKE_SOURCE_DIR}/bin/git $<TARGET_FILE_DIR:rpcs3>/../Resources/git
-		COMMAND "${MACDEPLOYQT_EXECUTABLE}" "${PROJECT_BINARY_DIR}/bin/rpcs3.app")
+		COMMAND "${MACDEPLOYQT_EXECUTABLE}" "${PROJECT_BINARY_DIR}/bin/rpcs3.app" "${QT_DEPLOY_FLAGS}")
 elseif(UNIX)
 	add_custom_command(TARGET rpcs3 POST_BUILD
 		COMMAND ${CMAKE_COMMAND} -E copy_directory
diff --git a/rpcs3/Emu/CPU/CPUTranslator.h b/rpcs3/Emu/CPU/CPUTranslator.h
index 3c43eb87a2..a140c83a79 100644
--- a/rpcs3/Emu/CPU/CPUTranslator.h
+++ b/rpcs3/Emu/CPU/CPUTranslator.h
@@ -21,6 +21,8 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/IR/IntrinsicsX86.h"
+#include "llvm/IR/IntrinsicsAArch64.h"
+
 #ifdef _MSC_VER
 #pragma warning(pop)
 #else
@@ -2894,8 +2896,12 @@ protected:
 	bool m_is_be;
 
 	// Allow PSHUFB intrinsic
+#ifdef ARCH_X64
 	bool m_use_ssse3 = true;
-
+#else
+	// TODO: fix the pshufb arm64 native impl using TBL instruction
+	bool m_use_ssse3 = false;
+#endif
 	// Allow FMA
 	bool m_use_fma = false;
 
@@ -3640,25 +3646,41 @@ public:
 	template <typename T, typename = std::enable_if_t<std::is_same_v<llvm_common_t<T>, f32[4]>>>
 	static auto fre(T&& a)
 	{
+#if defined(ARCH_X64)
 		return llvm_calli{"llvm.x86.sse.rcp.ps", {std::forward<T>(a)}};
+#elif defined(ARCH_ARM64)
+		return llvm_calli{"llvm.aarch64.neon.frecpe.v4f32", {std::forward<T>(a)}};
+#endif
 	}
 
 	template <typename T, typename = std::enable_if_t<std::is_same_v<llvm_common_t<T>, f32[4]>>>
 	static auto frsqe(T&& a)
 	{
+#if defined(ARCH_X64)
 		return llvm_calli{"llvm.x86.sse.rsqrt.ps", {std::forward<T>(a)}};
+#elif defined(ARCH_ARM64)
+		return llvm_calli{"llvm.aarch64.neon.frsqrte.v4f32", {std::forward<T>(a)}};
+#endif
 	}
 
 	template <typename T, typename U, typename = std::enable_if_t<std::is_same_v<llvm_common_t<T, U>, f32[4]>>>
 	static auto fmax(T&& a, U&& b)
 	{
+#if defined(ARCH_X64)
 		return llvm_calli{"llvm.x86.sse.max.ps", {std::forward<T>(a), std::forward<U>(b)}};
+#elif defined(ARCH_ARM64)
+		return llvm_calli{"llvm.aarch64.neon.fmax.v4f32", {std::forward<T>(a), std::forward<U>(b)}};
+#endif
 	}
 
 	template <typename T, typename U, typename = std::enable_if_t<std::is_same_v<llvm_common_t<T, U>, f32[4]>>>
 	static auto fmin(T&& a, U&& b)
 	{
+#if defined(ARCH_X64)
 		return llvm_calli{"llvm.x86.sse.min.ps", {std::forward<T>(a), std::forward<U>(b)}};
+#elif defined(ARCH_ARM64)
+		return llvm_calli{"llvm.aarch64.neon.fmin.v4f32", {std::forward<T>(a), std::forward<U>(b)}};
+#endif
 	}
 
 	template <typename T, typename = std::enable_if_t<std::is_same_v<llvm_common_t<T>, u8[16]>>>
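Note on the four SSE helpers above: they now map to the matching NEON intrinsics. Both `rcpps` and `frecpe` are low-precision estimates, and NEON pairs its estimate instructions with dedicated refinement steps (`FRECPS`/`FRSQRTS`). A hypothetical illustration with `arm_neon.h` intrinsics, not code from this patch:

```cpp
#include <arm_neon.h>

// Illustration: the raw FRECPE estimate the new intrinsic lowers to, plus one
// optional Newton-Raphson refinement step should more precision be needed.
float32x4_t reciprocal_estimate(float32x4_t x)
{
	float32x4_t r = vrecpeq_f32(x);      // FRECPE: rough 1/x estimate
	r = vmulq_f32(r, vrecpsq_f32(x, r)); // FRECPS: one refinement step
	return r;
}
```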
diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp
index 000c9bf62b..e25a9dd51f 100644
--- a/rpcs3/Emu/Cell/PPUThread.cpp
+++ b/rpcs3/Emu/Cell/PPUThread.cpp
@@ -65,6 +65,10 @@
 #include "util/simd.hpp"
 #include "util/sysinfo.hpp"
 
+#ifdef __APPLE__
+#include <pthread.h>
+#endif
+
 extern atomic_t g_watchdog_hold_ctr;
 
 // Should be of the same type
@@ -247,7 +251,104 @@ const auto ppu_gateway = build_function_asm<void(*)(ppu_thread*)>("ppu_gateway",
 	c.ret();
 #else
+	// See https://github.com/ghc/ghc/blob/master/rts/include/stg/MachRegs.h
+	// for GHC calling convention definitions on Aarch64
+	// and https://developer.arm.com/documentation/den0024/a/The-ABI-for-ARM-64-bit-Architecture/Register-use-in-the-AArch64-Procedure-Call-Standard/Parameters-in-general-purpose-registers
+	// for AArch64 calling convention
+
+	// Push callee saved registers to the stack
+	// We need to save x18-x30 = 13 x 8B each + 8 bytes for 16B alignment = 112B
+	c.sub(a64::sp, a64::sp, Imm(112));
+	c.stp(a64::x18, a64::x19, arm::Mem(a64::sp));
+	c.stp(a64::x20, a64::x21, arm::Mem(a64::sp, 16));
+	c.stp(a64::x22, a64::x23, arm::Mem(a64::sp, 32));
+	c.stp(a64::x24, a64::x25, arm::Mem(a64::sp, 48));
+	c.stp(a64::x26, a64::x27, arm::Mem(a64::sp, 64));
+	c.stp(a64::x28, a64::x29, arm::Mem(a64::sp, 80));
+	c.str(a64::x30, arm::Mem(a64::sp, 96));
+
+	// Save sp for native longjmp emulation
+	Label native_sp_offset = c.newLabel();
+	c.ldr(a64::x10, arm::Mem(native_sp_offset));
+	c.str(a64::sp, arm::Mem(args[0], a64::x10));
+
+	// Load REG_Base - use absolute jump target to bypass rel jmp range limits
+	Label exec_addr = c.newLabel();
+	c.ldr(a64::x19, arm::Mem(exec_addr));
+	c.ldr(a64::x19, arm::Mem(a64::x19));
+	// Load PPUThread struct base -> REG_Sp
+	const arm::GpX ppu_t_base = a64::x20;
+	c.mov(ppu_t_base, args[0]);
+	// Load PC
+	const arm::GpX pc = a64::x26;
+	Label cia_offset = c.newLabel();
+	const arm::GpX cia_addr_reg = a64::x11;
+	// Load offset value
+	c.ldr(cia_addr_reg, arm::Mem(cia_offset));
+	// Load cia
+	c.ldr(pc, arm::Mem(ppu_t_base, cia_addr_reg));
+	// Zero top 32 bits
+	c.mov(a64::w26, a64::w26);
+	// Multiply by 2 to index into ptr table
+	const arm::GpX index_shift = a64::x27;
+	c.mov(index_shift, Imm(2));
+	c.mul(pc, pc, index_shift);
+
+	// Load call target
+	const arm::GpX call_target = a64::x28;
+	c.ldr(call_target, arm::Mem(a64::x19, pc));
+	// Compute REG_Hp
+	const arm::GpX reg_hp = a64::x21;
+	c.mov(reg_hp, call_target);
+	c.lsr(reg_hp, reg_hp, 48);
+	c.lsl(reg_hp, reg_hp, 13);
+
+	// Zero top 16 bits of call target
+	c.lsl(call_target, call_target, Imm(16));
+	c.lsr(call_target, call_target, Imm(16));
+
+	// Load registers
+	Label base_addr = c.newLabel();
+	c.ldr(a64::x22, arm::Mem(base_addr));
+	c.ldr(a64::x22, arm::Mem(a64::x22));
+
+	Label gpr_addr_offset = c.newLabel();
+	const arm::GpX gpr_addr_reg = a64::x9;
+	c.ldr(gpr_addr_reg, arm::Mem(gpr_addr_offset));
+	c.add(gpr_addr_reg, gpr_addr_reg, ppu_t_base);
+	c.ldr(a64::x23, arm::Mem(gpr_addr_reg));
+	c.ldr(a64::x24, arm::Mem(gpr_addr_reg, 8));
+	c.ldr(a64::x25, arm::Mem(gpr_addr_reg, 16));
+
+	// Execute LLE call
+	c.blr(call_target);
+
+	// Restore stack ptr
+	c.ldr(a64::x10, arm::Mem(native_sp_offset));
+	c.ldr(a64::sp, arm::Mem(args[0], a64::x10));
+	// Restore registers from the stack
+	c.ldp(a64::x18, a64::x19, arm::Mem(a64::sp));
+	c.ldp(a64::x20, a64::x21, arm::Mem(a64::sp, 16));
+	c.ldp(a64::x22, a64::x23, arm::Mem(a64::sp, 32));
+	c.ldp(a64::x24, a64::x25, arm::Mem(a64::sp, 48));
+	c.ldp(a64::x26, a64::x27, arm::Mem(a64::sp, 64));
+	c.ldp(a64::x28, a64::x29, arm::Mem(a64::sp, 80));
+	c.ldr(a64::x30, arm::Mem(a64::sp, 96));
+	// Restore stack ptr
+	c.add(a64::sp, a64::sp, Imm(112));
+	// Return
+	c.ret(a64::x30);
+
+	c.bind(exec_addr);
+	c.embedUInt64(reinterpret_cast<u64>(&vm::g_exec_addr));
+	c.bind(base_addr);
+	c.embedUInt64(reinterpret_cast<u64>(&vm::g_base_addr));
+	c.bind(cia_offset);
+	c.embedUInt64(static_cast<u64>(::offset32(&ppu_thread::cia)));
+	c.bind(gpr_addr_offset);
+	c.embedUInt64(static_cast<u64>(::offset32(&ppu_thread::gpr)));
+	c.bind(native_sp_offset);
+	c.embedUInt64(static_cast<u64>(::offset32(&ppu_thread::saved_native_sp)));
 #endif
 });
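Note on the gateway above: the shift pairs decode the packed function-table entry format. The top 16 bits (shifted down 48, then up 13) seed x21 (REG_Hp in GHC terms), and the remaining low 48 bits are the absolute call target. A plain C++ restatement of that decode, as an orientation sketch only (struct and function names are hypothetical):

```cpp
#include <cstdint>

using u64 = std::uint64_t;

// Sketch (not patch code): how one exec-table entry is split, matching the
// lsr/lsl pairs above. GHC convention pins x19=Base, x20=Sp, x21=Hp,
// x22..x25=R1..R4 per GHC's MachRegs.h.
struct decoded_entry { u64 call_target; u64 reg_hp; };

decoded_entry decode_exec_entry(u64 entry)
{
	decoded_entry d{};
	d.reg_hp      = (entry >> 48) << 13; // top 16 bits, pre-shifted for x21
	d.call_target = (entry << 16) >> 16; // low 48 bits: absolute code address
	return d;
}
```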
@@ -1252,6 +1353,9 @@ void ppu_thread::cpu_task()
 	}
 	case ppu_cmd::initialize:
 	{
+#ifdef __APPLE__
+		pthread_jit_write_protect_np(false);
+#endif
 		cmd_pop();
 
 		while (!g_fxo->get().is_inited && !is_stopped())
@@ -1267,6 +1371,15 @@ void ppu_thread::cpu_task()
 			thread_ctrl::wait_on(g_progr_ptotal, 0);
 
 		g_fxo->get().skip_the_progress_dialog = true;
+#ifdef __APPLE__
+		pthread_jit_write_protect_np(true);
+#endif
+#ifdef ARCH_ARM64
+		// Flush all cache lines after potentially writing executable code
+		asm("ISB");
+		asm("DSB ISH");
+#endif
+
 		break;
 	}
 	case ppu_cmd::sleep:
@@ -1396,6 +1509,15 @@ ppu_thread::ppu_thread(const ppu_thread_params& param, std::string_view name, u3
 	{
 		call_history.data.resize(call_history_max_size);
 	}
+
+#ifdef __APPLE__
+	pthread_jit_write_protect_np(true);
+#endif
+#ifdef ARCH_ARM64
+	// Flush all cache lines after potentially writing executable code
+	asm("ISB");
+	asm("DSB ISH");
+#endif
 }
 
 ppu_thread::thread_name_t::operator std::string() const
@@ -1974,6 +2096,8 @@ const auto ppu_stcx_accurate_tx = build_function_asm
@@ ... @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<...>)
 	named_thread_group workers(..., std::min<u32>(utils::get_thread_count(), ::size32(file_queue)), [&]
 	{
+#ifdef __APPLE__
+		pthread_jit_write_protect_np(false);
+#endif
 		// Set low priority
 		thread_ctrl::scoped_priority low_prio(-1);
@@ -3226,6 +3353,9 @@ bool ppu_initialize(const ppu_module& info, bool check_only)
 		// Set low priority
 		thread_ctrl::scoped_priority low_prio(-1);
 
+#ifdef __APPLE__
+		pthread_jit_write_protect_np(false);
+#endif
 		for (u32 i = work_cv++; i < workload.size(); i = work_cv++, g_progr_pdone++)
 		{
 			if (Emu.IsStopped())
@@ -3287,6 +3417,9 @@ bool ppu_initialize(const ppu_module& info, bool check_only)
 	}
 
 	// Jit can be null if the loop doesn't ever enter.
+#ifdef __APPLE__
+	pthread_jit_write_protect_np(false);
+#endif
 	if (jit && !jit_mod.init)
 	{
 		jit->fin();
@@ -3345,7 +3478,12 @@ static void ppu_initialize2(jit_compiler& jit, const ppu_module& module_part, co
 	std::unique_ptr<Module> _module = std::make_unique<Module>(obj_name, jit.get_context());
 
 	// Initialize target
+#if defined(__APPLE__) && defined(ARCH_ARM64)
+	// Force target linux on macOS arm64 to bypass some 64-bit address space linking issues
+	_module->setTargetTriple(utils::c_llvm_default_triple);
+#else
 	_module->setTargetTriple(Triple::normalize(sys::getProcessTriple()));
+#endif
 	_module->setDataLayout(jit.get_engine().getTargetMachine()->createDataLayout());
 
 	// Initialize translator
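Note on the recurring `asm("ISB"); asm("DSB ISH");` pairs: toolchains also expose a range-based primitive, `__builtin___clear_cache`, which emits the full AArch64 maintenance sequence (DC CVAU / DSB ISH / IC IVAU / DSB ISH / ISB) for a given region. A sketch of how the repetition could be consolidated, assuming Clang or GCC (not part of the patch):

```cpp
#include <cstddef>

// Hypothetical helper consolidating the repeated barrier blocks.
inline void flush_jit_region(void* start, std::size_t size)
{
#if defined(__aarch64__)
	char* p = static_cast<char*>(start);
	__builtin___clear_cache(p, p + size); // D-cache to I-cache sync for range
#else
	(void)start; (void)size; // x86 keeps instruction caches coherent
#endif
}
```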
diff --git a/rpcs3/Emu/Cell/PPUTranslator.cpp b/rpcs3/Emu/Cell/PPUTranslator.cpp
index 48decf175c..50de63ec9a 100644
--- a/rpcs3/Emu/Cell/PPUTranslator.cpp
+++ b/rpcs3/Emu/Cell/PPUTranslator.cpp
@@ -197,7 +197,7 @@ Function* PPUTranslator::Translate(const ppu_function& info)
 		// Create tail call to the check function
 		m_ir->SetInsertPoint(vcheck);
-		Call(GetType<void>(), "__check", m_thread, GetAddr())->setTailCallKind(llvm::CallInst::TCK_Tail);
+		Call(GetType<void>(), "__check", m_thread, GetAddr());
 		m_ir->CreateRetVoid();
 	}
 	else
@@ -1948,13 +1948,13 @@ void PPUTranslator::SC(ppu_opcode_t op)
 		if (index < 1024)
 		{
-			Call(GetType<void>(), fmt::format("%s", ppu_syscall_code(index)), m_thread)->setTailCallKind(llvm::CallInst::TCK_Tail);
+			Call(GetType<void>(), fmt::format("%s", ppu_syscall_code(index)), m_thread);
 			m_ir->CreateRetVoid();
 			return;
 		}
 	}
 
-	Call(GetType<void>(), op.lev ? "__lv1call" : "__syscall", m_thread, num)->setTailCallKind(llvm::CallInst::TCK_Tail);
+	Call(GetType<void>(), op.lev ? "__lv1call" : "__syscall", m_thread, num);
 	m_ir->CreateRetVoid();
 }
@@ -2506,7 +2506,7 @@ void PPUTranslator::LWARX(ppu_opcode_t op)
 	{
 		RegStore(Trunc(GetAddr()), m_cia);
 		FlushRegisters();
-		Call(GetType<void>(), "__resinterp", m_thread)->setTailCallKind(llvm::CallInst::TCK_Tail);
+		Call(GetType<void>(), "__resinterp", m_thread);
 		m_ir->CreateRetVoid();
 		return;
 	}
@@ -2648,7 +2648,7 @@ void PPUTranslator::LDARX(ppu_opcode_t op)
 	{
 		RegStore(Trunc(GetAddr()), m_cia);
 		FlushRegisters();
-		Call(GetType<void>(), "__resinterp", m_thread)->setTailCallKind(llvm::CallInst::TCK_Tail);
+		Call(GetType<void>(), "__resinterp", m_thread);
 		m_ir->CreateRetVoid();
 		return;
 	}
@@ -4246,7 +4246,11 @@ void PPUTranslator::FCTIW(ppu_opcode_t op)
 	const auto xormask = m_ir->CreateSExt(m_ir->CreateFCmpOGE(b, ConstantFP::get(GetType<f64>(), std::exp2l(31.))), GetType<s32>());
 
 	// fix result saturation (0x80000000 -> 0x7fffffff)
+#if defined(ARCH_X64)
 	SetFpr(op.frd, m_ir->CreateXor(xormask, Call(GetType<s32>(), "llvm.x86.sse2.cvtsd2si", m_ir->CreateInsertElement(GetUndef<f64[2]>(), b, u64{0}))));
+#elif defined(ARCH_ARM64)
+	SetFpr(op.frd, m_ir->CreateXor(xormask, Call(GetType<s32>(), "llvm.aarch64.neon.fcvtns.i32.f64", b)));
+#endif
 
 	//SetFPSCR_FR(Call(GetType(), m_pure_attr, "__fctiw_get_fr", b));
 	//SetFPSCR_FI(Call(GetType(), m_pure_attr, "__fctiw_get_fi", b));
@@ -4262,7 +4266,11 @@ void PPUTranslator::FCTIWZ(ppu_opcode_t op)
 	const auto xormask = m_ir->CreateSExt(m_ir->CreateFCmpOGE(b, ConstantFP::get(GetType<f64>(), std::exp2l(31.))), GetType<s32>());
 
 	// fix result saturation (0x80000000 -> 0x7fffffff)
+#if defined(ARCH_X64)
 	SetFpr(op.frd, m_ir->CreateXor(xormask, Call(GetType<s32>(), "llvm.x86.sse2.cvttsd2si", m_ir->CreateInsertElement(GetUndef<f64[2]>(), b, u64{0}))));
+#elif defined(ARCH_ARM64)
+	SetFpr(op.frd, m_ir->CreateXor(xormask, Call(GetType<s32>(), "llvm.aarch64.neon.fcvtzs.i32.f64", b)));
+#endif
 }
 
 void PPUTranslator::FDIV(ppu_opcode_t op)
@@ -4538,7 +4546,12 @@ void PPUTranslator::FCTID(ppu_opcode_t op)
 	const auto xormask = m_ir->CreateSExt(m_ir->CreateFCmpOGE(b, ConstantFP::get(GetType<f64>(), std::exp2l(63.))), GetType<s64>());
 
 	// fix result saturation (0x8000000000000000 -> 0x7fffffffffffffff)
+#if defined(ARCH_X64)
 	SetFpr(op.frd, m_ir->CreateXor(xormask, Call(GetType<s64>(), "llvm.x86.sse2.cvtsd2si64", m_ir->CreateInsertElement(GetUndef<f64[2]>(), b, u64{0}))));
+#elif defined(ARCH_ARM64)
+	SetFpr(op.frd, m_ir->CreateXor(xormask, Call(GetType<s64>(), "llvm.aarch64.neon.fcvtns.i64.f64", b)));
+#endif
+
 	//SetFPSCR_FR(Call(GetType(), m_pure_attr, "__fctid_get_fr", b));
 	//SetFPSCR_FI(Call(GetType(), m_pure_attr, "__fctid_get_fi", b));
@@ -4554,7 +4567,11 @@ void PPUTranslator::FCTIDZ(ppu_opcode_t op)
 	const auto xormask = m_ir->CreateSExt(m_ir->CreateFCmpOGE(b, ConstantFP::get(GetType<f64>(), std::exp2l(63.))), GetType<s64>());
 
 	// fix result saturation (0x8000000000000000 -> 0x7fffffffffffffff)
+#if defined(ARCH_X64)
 	SetFpr(op.frd, m_ir->CreateXor(xormask, Call(GetType<s64>(), "llvm.x86.sse2.cvttsd2si64", m_ir->CreateInsertElement(GetUndef<f64[2]>(), b, u64{0}))));
+#elif defined(ARCH_ARM64)
+	SetFpr(op.frd, m_ir->CreateXor(xormask, Call(GetType<s64>(), "llvm.aarch64.neon.fcvtzs.i64.f64", b)));
+#endif
 }
 
 void PPUTranslator::FCFID(ppu_opcode_t op)
@@ -4571,7 +4588,7 @@ void PPUTranslator::FCFID(ppu_opcode_t op)
 void PPUTranslator::UNK(ppu_opcode_t op)
 {
 	FlushRegisters();
-	Call(GetType<void>(), "__error", m_thread, GetAddr(), m_ir->getInt32(op.opcode))->setTailCallKind(llvm::CallInst::TCK_Tail);
+	Call(GetType<void>(), "__error", m_thread, GetAddr(), m_ir->getInt32(op.opcode));
 	m_ir->CreateRetVoid();
 }
@@ -4832,7 +4849,7 @@ Value* PPUTranslator::CheckTrapCondition(u32 to, Value* left, Value* right)
 
 void PPUTranslator::Trap()
 {
-	Call(GetType<void>(), "__trap", m_thread, GetAddr())->setTailCallKind(llvm::CallInst::TCK_Tail);
+	Call(GetType<void>(), "__trap", m_thread, GetAddr());
 	m_ir->CreateRetVoid();
 }
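Note on the FCTIW/FCTID hunks: the fixup exists because x86 `CVTSD2SI` returns the "integer indefinite" value 0x80000000 on overflow, while PPC semantics require positive overflow to saturate to 0x7fffffff; the `xormask` is all ones exactly when b >= 2^31 and flips the one into the other. A reference restatement of the x86 path in plain C++ (illustrative only; the AArch64 `FCVTNS`/`FCVTZS` forms saturate on their own):

```cpp
#include <cmath>
#include <cstdint>

// Reference model (not patch code) of the FCTIW saturation fixup.
std::int32_t fctiw_reference(double b)
{
	const bool over = b >= 0x1p31;                // the FCmpOGE against 2^31
	const std::int32_t raw = over ? INT32_MIN     // what CVTSD2SI yields on overflow
	                              : static_cast<std::int32_t>(std::nearbyint(b));
	const std::int32_t mask = over ? -1 : 0;      // the sign-extended xormask
	return raw ^ mask;                            // 0x80000000 -> 0x7fffffff
}
```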
diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp
index 95f4bb3f6f..b4beb750a6 100644
--- a/rpcs3/Emu/Cell/SPURecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPURecompiler.cpp
@@ -34,6 +34,15 @@ const extern spu_decoder<spu_iflag> g_spu_iflag;
 
 // Move 4 args for calling native function from a GHC calling convention function
 static u8* move_args_ghc_to_native(u8* raw)
 {
+#ifdef ARCH_ARM64
+	// Note: this is a placeholder to get rpcs3 working for now
+	// mov x0, x22
+	// mov x1, x23
+	// mov x2, x24
+	// mov x3, x25
+	std::memcpy(raw, "\xE0\x03\x16\xAA\xE1\x03\x17\xAA\xE2\x03\x18\xAA\xE3\x03\x19\xAA", 16);
+	return raw + 16;
+#else
 #ifdef _WIN32
 	// mov rcx, r13
 	// mov rdx, rbp
@@ -49,10 +58,14 @@ static u8* move_args_ghc_to_native(u8* raw)
 #endif
 
 	return raw + 12;
+#endif
 }
 
 DECLARE(spu_runtime::tr_dispatch) = []
 {
+#ifdef __APPLE__
+	pthread_jit_write_protect_np(false);
+#endif
 	// Generate a special trampoline to spu_recompiler_base::dispatch with pause instruction
 	u8* const trptr = jit_runtime::alloc(32, 16);
 	u8* raw = move_args_ghc_to_native(trptr);
@@ -439,6 +452,9 @@ void spu_cache::initialize()
 
 	named_thread_group workers("SPU Worker ", worker_count, [&]() -> uint
 	{
+#ifdef __APPLE__
+		pthread_jit_write_protect_np(false);
+#endif
 		// Set low priority
 		thread_ctrl::scoped_priority low_prio(-1);
@@ -4412,7 +4428,7 @@ public:
 		// Create LLVM module
 		std::unique_ptr<Module> _module = std::make_unique<Module>(m_hash + ".obj", m_context);
-		_module->setTargetTriple(Triple::normalize("x86_64-unknown-linux-gnu"));
+		_module->setTargetTriple(utils::c_llvm_default_triple);
 		_module->setDataLayout(m_jit.get_engine().getTargetMachine()->createDataLayout());
 		m_module = _module.get();
@@ -4672,7 +4688,12 @@ public:
 		// Function that executes check_state and escapes if necessary
 		m_test_state = llvm::cast<Function>(m_module->getOrInsertFunction("spu_test_state", get_ftype()).getCallee());
 		m_test_state->setLinkage(GlobalValue::InternalLinkage);
+#ifdef ARCH_ARM64
+		// LLVM doesn't support PreserveAll on arm64.
+		m_test_state->setCallingConv(CallingConv::GHC);
+#else
 		m_test_state->setCallingConv(CallingConv::PreserveAll);
+#endif
 		m_ir->SetInsertPoint(BasicBlock::Create(m_context, "", m_test_state));
 		const auto escape_yes = BasicBlock::Create(m_context, "", m_test_state);
 		const auto escape_no = BasicBlock::Create(m_context, "", m_test_state);
@@ -5069,7 +5090,7 @@ public:
 		// Create LLVM module
 		std::unique_ptr<Module> _module = std::make_unique<Module>("spu_interpreter.obj", m_context);
-		_module->setTargetTriple(Triple::normalize("x86_64-unknown-linux-gnu"));
+		_module->setTargetTriple(utils::c_llvm_default_triple);
 		_module->setDataLayout(m_jit.get_engine().getTargetMachine()->createDataLayout());
 		m_module = _module.get();
@@ -5114,7 +5135,11 @@ public:
 		// Save host thread's stack pointer
 		const auto native_sp = spu_ptr<u64>(&spu_thread::saved_native_sp);
+#if defined(ARCH_X64)
 		const auto rsp_name = MetadataAsValue::get(m_context, MDNode::get(m_context, {MDString::get(m_context, "rsp")}));
+#elif defined(ARCH_ARM64)
+		const auto rsp_name = MetadataAsValue::get(m_context, MDNode::get(m_context, {MDString::get(m_context, "sp")}));
+#endif
 		m_ir->CreateStore(m_ir->CreateCall(get_intrinsic(Intrinsic::read_register), {rsp_name}), native_sp);
 
 		// Decode (shift) and load function pointer
@@ -5328,7 +5353,11 @@ public:
 		else if (!(itype & spu_itype::branch))
 		{
 			// Hack: inline ret instruction before final jmp; this is not reliable.
+#ifdef ARCH_X64
 			m_ir->CreateCall(InlineAsm::get(get_ftype(), "ret", "", true, false, InlineAsm::AD_Intel));
+#else
+			m_ir->CreateCall(InlineAsm::get(get_ftype(), "ret", "", true, false));
+#endif
 			fret = ret_func;
 		}
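Note on the ARM64 branch of `move_args_ghc_to_native`: the 16 raw bytes it memcpy's are exactly the four commented MOVs; MOV (register) assembles as the alias `ORR Xd, XZR, Xm`. A small verification sketch of the encoding (not patch code):

```cpp
#include <cstdint>
#include <cstdio>

// MOV Xd, Xm == ORR Xd, XZR, Xm: 0xAA000000 | (Rm << 16) | (31 << 5) | Rd.
constexpr std::uint32_t mov_x(unsigned rd, unsigned rm)
{
	return 0xAA000000u | (rm << 16u) | (31u << 5u) | rd;
}

int main()
{
	// Printed words match the embedded little-endian bytes "\xE0\x03\x16\xAA"...
	std::printf("%08X %08X %08X %08X\n",
		mov_x(0, 22),  // AA1603E0: mov x0, x22
		mov_x(1, 23),  // AA1703E1: mov x1, x23
		mov_x(2, 24),  // AA1803E2: mov x2, x24
		mov_x(3, 25)); // AA1903E3: mov x3, x25
}
```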
diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp
index e85719d969..31f4a86432 100644
--- a/rpcs3/Emu/Cell/SPUThread.cpp
+++ b/rpcs3/Emu/Cell/SPUThread.cpp
@@ -1414,6 +1414,9 @@ extern thread_local std::string(*g_tls_log_prefix)();
 
 void spu_thread::cpu_task()
 {
+#ifdef __APPLE__
+	pthread_jit_write_protect_np(true);
+#endif
 	// Get next PC and SPU Interrupt status
 	pc = status_npc.load().npc;
diff --git a/rpcs3/Emu/Cell/lv2/lv2.cpp b/rpcs3/Emu/Cell/lv2/lv2.cpp
index bc075957be..4add1f5b87 100644
--- a/rpcs3/Emu/Cell/lv2/lv2.cpp
+++ b/rpcs3/Emu/Cell/lv2/lv2.cpp
@@ -1149,8 +1149,16 @@ extern void ppu_execute_syscall(ppu_thread& ppu, u64 code)
 
 	if (const auto func = g_ppu_syscall_table[code].first)
 	{
+#ifdef __APPLE__
+		pthread_jit_write_protect_np(false);
+#endif
 		func(ppu, {}, vm::_ptr<u32>(ppu.cia), nullptr);
 
 		ppu_log.trace("Syscall '%s' (%llu) finished, r3=0x%llx", ppu_syscall_code(code), code, ppu.gpr[3]);
+
+#ifdef __APPLE__
+		pthread_jit_write_protect_np(true);
+		// No need to flush cache lines after a syscall, since we didn't generate any code.
+#endif
 		return;
 	}
 }
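Note on the lv2.cpp hunk: `pthread_jit_write_protect_np` changes only the calling thread's view of MAP_JIT pages, which is why the syscall path, every worker lambda, and each thread entry point in this patch toggle it individually. A minimal sketch of that per-thread behavior (hypothetical, compiles only on macOS arm64):

```cpp
#include <pthread.h>
#include <thread>

// Each thread owns its own W^X state for MAP_JIT pages.
void worker()
{
	pthread_jit_write_protect_np(false); // writable in this thread only
	// ... emit or patch code here ...
	pthread_jit_write_protect_np(true);  // executable again for this thread
}

int main()
{
	std::thread t(worker); // does not inherit the parent thread's toggle
	t.join();
}
```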
diff --git a/rpcs3/Emu/RSX/Common/BufferUtils.cpp b/rpcs3/Emu/RSX/Common/BufferUtils.cpp
index 70a5bfe304..baa2c3ccee 100644
--- a/rpcs3/Emu/RSX/Common/BufferUtils.cpp
+++ b/rpcs3/Emu/RSX/Common/BufferUtils.cpp
@@ -270,9 +270,13 @@ namespace
 #endif
 }
 
+#if !defined(__APPLE__) || defined(ARCH_X64)
 DECLARE(copy_data_swap_u32) = build_function_asm("copy_data_swap_u32", &build_copy_data_swap_u32);
-
 DECLARE(copy_data_swap_u32_cmp) = build_function_asm("copy_data_swap_u32_cmp", &build_copy_data_swap_u32);
+#else
+DECLARE(copy_data_swap_u32) = copy_data_swap_u32_naive;
+DECLARE(copy_data_swap_u32_cmp) = copy_data_swap_u32_naive;
+#endif
 
 namespace
 {
diff --git a/rpcs3/util/sysinfo.hpp b/rpcs3/util/sysinfo.hpp
index ef8c09c78c..ef14516741 100755
--- a/rpcs3/util/sysinfo.hpp
+++ b/rpcs3/util/sysinfo.hpp
@@ -67,4 +67,16 @@ namespace utils
 	u32 get_rep_movsb_threshold();
 
 	extern const u64 main_tid;
+
+#ifdef LLVM_AVAILABLE
+
+#if defined(ARCH_X64)
+	const std::string c_llvm_default_triple = "x86_64-unknown-linux-gnu";
+#elif defined(ARCH_ARM64)
+	const std::string c_llvm_default_triple = "arm64-unknown-linux-gnu";
+#else
+	const std::string c_llvm_default_triple = "Unimplemented!";
+#endif
+
+#endif
 }
diff --git a/rpcs3/util/vm_native.cpp b/rpcs3/util/vm_native.cpp
index 1f4ed049f1..c08a9397a4 100644
--- a/rpcs3/util/vm_native.cpp
+++ b/rpcs3/util/vm_native.cpp
@@ -260,7 +260,11 @@ namespace utils
 		size += 0x10000;
 	}
 
+#ifdef __APPLE__
+	auto ptr = ::mmap(use_addr, size, PROT_NONE, MAP_ANON | MAP_PRIVATE | MAP_JIT | c_map_noreserve, -1, 0);
+#else
 	auto ptr = ::mmap(use_addr, size, PROT_NONE, MAP_ANON | MAP_PRIVATE | c_map_noreserve, -1, 0);
+#endif
 
 	if (ptr == reinterpret_cast<void*>(uptr{umax}))
 	{
@@ -333,7 +337,16 @@ namespace utils
 		ensure(::VirtualFree(pointer, size, MEM_DECOMMIT));
 #else
 		const u64 ptr64 = reinterpret_cast<u64>(pointer);
+#if defined(__APPLE__) && defined(ARCH_ARM64)
+		// Hack: on macOS, Apple explicitly fails mmap if you combine MAP_FIXED and MAP_JIT.
+		// So we unmap the space and just hope it maps to the same address we got before instead.
+		// The Xcode manpage says the pointer is a hint and the OS will try to map at the hint location
+		// so this isn't completely undefined behavior.
+		ensure(::munmap(pointer, size) != -1);
+		ensure(::mmap(pointer, size, PROT_NONE, MAP_ANON | MAP_PRIVATE | MAP_JIT, -1, 0) == pointer);
+#else
 		ensure(::mmap(pointer, size, PROT_NONE, MAP_FIXED | MAP_ANON | MAP_PRIVATE | c_map_noreserve, -1, 0) != reinterpret_cast<void*>(uptr{umax}));
+#endif
 
 		if constexpr (c_madv_no_dump != 0)
 		{
@@ -353,7 +366,12 @@ namespace utils
 		memory_commit(pointer, size, prot);
 #else
 		const u64 ptr64 = reinterpret_cast<u64>(pointer);
+#if defined(__APPLE__) && defined(ARCH_ARM64)
+		ensure(::munmap(pointer, size) != -1);
+		ensure(::mmap(pointer, size, +prot, MAP_ANON | MAP_PRIVATE | MAP_JIT, -1, 0) == pointer);
+#else
 		ensure(::mmap(pointer, size, +prot, MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0) != reinterpret_cast<void*>(uptr{umax}));
+#endif
 
 		if constexpr (c_madv_hugepage != 0)
 		{
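Note on the vm_native.cpp hunks: taken together they implement Apple's documented JIT recipe for Apple Silicon: reserve with MAP_JIT up front (MAP_FIXED | MAP_JIT is rejected, hence the munmap-and-remap-at-hint workaround), write with the per-thread protection lowered, and invalidate the instruction cache before executing. A standalone end-to-end sketch of that pattern (not rpcs3 code; under the hardened runtime it also requires the com.apple.security.cs.allow-jit entitlement):

```cpp
#include <libkern/OSCacheControl.h> // sys_icache_invalidate
#include <pthread.h>
#include <sys/mman.h>
#include <cassert>
#include <cstdint>
#include <cstring>

int main()
{
	const std::size_t size = 0x4000;
	void* mem = ::mmap(nullptr, size, PROT_READ | PROT_WRITE | PROT_EXEC,
	                   MAP_ANON | MAP_PRIVATE | MAP_JIT, -1, 0);
	assert(mem != MAP_FAILED);

	pthread_jit_write_protect_np(false);     // this thread: pages writable
	const std::uint32_t ret = 0xD65F03C0;    // AArch64 RET
	std::memcpy(mem, &ret, sizeof(ret));
	pthread_jit_write_protect_np(true);      // this thread: pages executable
	sys_icache_invalidate(mem, sizeof(ret)); // sync I-cache before running

	reinterpret_cast<void (*)()>(mem)();     // call the freshly written stub
	return 0;
}
```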