
PPU LLVM arm64+macOS port (#12115)

* BufferUtils: use naive function pointer on Apple arm64

Use a naive function pointer on Apple arm64 because ASLR breaks asmjit.
See the comment in BufferUtils.cpp for an explanation of why this happens
and how to fix it if you want to use asmjit.
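
For context, a standalone sketch (illustrative only; the authoritative
explanation is the BufferUtils.cpp comment) of one way ASLR gets in the way:
it can place a freshly mapped JIT page farther from a C++ helper than an
AArch64 relative branch (signed 26-bit word offset, i.e. +/-128 MiB) can reach.

#include <sys/mman.h>
#include <cstdint>
#include <cstdio>

static void helper() {} // stand-in for a C++ function JITed code would call into

int main()
{
    // Map a page the way a JIT allocator might; ASLR decides where it lands.
    void* jit = mmap(nullptr, 0x4000, PROT_READ | PROT_WRITE,
                     MAP_ANON | MAP_PRIVATE, -1, 0);
    if (jit == MAP_FAILED) return 1;

    const auto fn  = reinterpret_cast<unsigned long long>(&helper);
    const auto buf = reinterpret_cast<unsigned long long>(jit);
    const long long disp = static_cast<long long>(fn - buf);

    // AArch64 B/BL encodes a signed 26-bit word offset: +/-128 MiB.
    const bool reachable = disp >= -(1LL << 27) && disp < (1LL << 27);
    std::printf("displacement = %lld bytes, relative branch %s\n",
                disp, reachable ? "reachable" : "out of range");

    munmap(jit, 0x4000);
    return 0;
}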

* build-macos: fix source maps for Mac

Tell Qt not to strip debug symbols when building in Debug or RelWithDebInfo
mode.

* LLVM PPU: fix aarch64 on macOS

Force MachO on macOS to fix LLVM being unable to patch relocations
during codegen. Adds AArch64 NEON intrinsics for the x86 intrinsics used by
PPUTranslator/Recompiler.

* virtual memory: use 16k pages on aarch64 macOS

Temporary hack to get things working by using 16k pages instead of 4k
pages in VM emulation.

* PPU/SPU: fix NEON intrinsics and compilation for arm64 macOS

Fixes some intrinsics usage and patches asmjit usages to emit absolute
jumps so ASLR doesn't cause out-of-range relative jumps. Also patches the
SPU recompiler to work properly on arm64 by telling LLVM to target arm64.

* virtual memory: fix W^X toggles on macOS aarch64

Fixes W^X on macOS aarch64 by setting all JIT mmap'd regions to default
to RW mode. For both SPU and PPU execution threads, we toggle to RX mode
once initialization finishes. This exploits Apple's per-thread RW/RX
setting to stay technically compliant with the OS's W^X enforcement
without actually separating the memory allocated for code and data.
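
A minimal standalone sketch (not rpcs3's allocator) of the mechanism this
relies on: the region is mapped once with MAP_JIT and RWX protection, the
calling thread flips to the RW view while emitting code, then flips back to
RX and flushes the instruction cache before executing.

#include <sys/mman.h>
#include <pthread.h>
#include <libkern/OSCacheControl.h>
#include <cstdint>
#include <cstring>

int main()
{
    // One mapping serves as both "data" (while writing) and "code" (while running).
    void* mem = mmap(nullptr, 0x4000, PROT_READ | PROT_WRITE | PROT_EXEC,
                     MAP_ANON | MAP_PRIVATE | MAP_JIT, -1, 0);
    if (mem == MAP_FAILED) return 1;

    pthread_jit_write_protect_np(false); // this thread sees the region as RW

    // mov w0, #42 ; ret  (AArch64 encodings, little-endian)
    const uint32_t code[] = {0x52800540, 0xd65f03c0};
    std::memcpy(mem, code, sizeof(code));

    pthread_jit_write_protect_np(true);  // this thread sees the region as RX
    sys_icache_invalidate(mem, sizeof(code)); // flush before executing new code

    const int result = reinterpret_cast<int (*)()>(mem)();
    munmap(mem, 0x4000);
    return result == 42 ? 0 : 1;
}

In rpcs3 the same toggling shows up below as pthread_jit_write_protect_np(false/true)
pairs around compilation, plus ISB / DSB ISH barriers on arm64.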

* PPU: implement aarch64 specific functions

Implements ppu_gateway for arm64 and patches LLVM initialization to use
the correct triple. Adds some fixes for macOS W^X JIT restrictions when
entering/exiting JITed code.

* PPU: Mark rpcs3 calls as non-tail

Strictly speaking, rpcs3 JIT -> C++ calls are not tail calls. If you
call a function inside e.g. an LV2 syscall, it will clobber LR on arm64
and subtly break returns in emulated code. Only JIT -> JIT "calls"
should be tail.
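
A standalone sketch (hypothetical function names, not the PPUTranslator API)
of the distinction: the JIT -> C++ call is emitted as an ordinary call, while
a JIT -> JIT continuation keeps the tail-call marking.

#include <llvm/IR/IRBuilder.h>
#include <llvm/IR/LLVMContext.h>
#include <llvm/IR/Module.h>
#include <llvm/IR/Verifier.h>
#include <llvm/Support/raw_ostream.h>

int main()
{
    llvm::LLVMContext ctx;
    llvm::Module m("tailcall_demo", ctx);
    auto* fty = llvm::FunctionType::get(llvm::Type::getVoidTy(ctx), false);

    auto* block = llvm::Function::Create(fty, llvm::Function::ExternalLinkage, "ppu_block", &m);
    llvm::IRBuilder<> ir(llvm::BasicBlock::Create(ctx, "", block));

    // JIT -> C++ helper (e.g. an LV2 syscall handler): plain call, so LR is
    // properly saved and restored around it.
    ir.CreateCall(m.getOrInsertFunction("sys_example", fty));

    // JIT -> JIT continuation: safe to keep as a guaranteed tail call.
    auto* cont = ir.CreateCall(m.getOrInsertFunction("next_block", fty));
    cont->setTailCallKind(llvm::CallInst::TCK_Tail);
    ir.CreateRetVoid();

    m.print(llvm::outs(), nullptr);
    return llvm::verifyModule(m, &llvm::errs()) ? 1 : 0;
}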

* macOS/arm64: compatibility fixes

* vm: patch virtual memory for arm64 macOS

Tag mmap calls with MAP_JIT to allow W^X on macOS. Fix mmap calls to
already-mapped addresses that were tagged with MAP_JIT on macOS. Fix
memory unmapping on 16K-page machines with a hack that marks "unmapped"
pages as RW.
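
A standalone sketch (not rpcs3 code) of the remapping problem mentioned here
and in the memory-mapping hunk near the end of this diff: MAP_FIXED | MAP_JIT
is rejected on macOS arm64, so the workaround is munmap followed by a hinted
(non-fixed) MAP_JIT mmap that is checked to land on the same address.

#include <sys/mman.h>
#include <cstddef>
#include <cstdio>

int main()
{
    const size_t size = 0x10000;
    void* base = mmap(nullptr, size, PROT_NONE,
                      MAP_ANON | MAP_PRIVATE | MAP_JIT, -1, 0);
    if (base == MAP_FAILED) return 1;

    // The usual "reset this range" approach; expected to fail when MAP_JIT is involved.
    void* fixed = mmap(base, size, PROT_NONE,
                       MAP_FIXED | MAP_ANON | MAP_PRIVATE | MAP_JIT, -1, 0);
    std::printf("MAP_FIXED | MAP_JIT: %s\n", fixed == MAP_FAILED ? "failed" : "ok");

    if (fixed == MAP_FAILED)
    {
        // Workaround used by the commit: unmap, then map again with the old
        // address as a hint and verify the kernel actually gave it back.
        munmap(base, size);
        void* again = mmap(base, size, PROT_NONE,
                           MAP_ANON | MAP_PRIVATE | MAP_JIT, -1, 0);
        std::printf("hinted remap %s\n", again == base ? "kept the address" : "moved");
    }
    return 0;
}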

* PPU: remove wrong comment

* PPU: fix a merge regression

* vm: remove 16k page hacks

* PPU: formatting fixes

* PPU: fix arm64 null function assembly

* ppu: clean up arch-specific instructions
Jeff Guo 2022-06-14 05:28:38 -07:00 committed by GitHub
parent 264253757c
commit cefc37a553
14 changed files with 306 additions and 16 deletions


@ -7,9 +7,8 @@ ExternalProject_Add(moltenvk
GIT_TAG 1236d2f
BUILD_IN_SOURCE 1
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/MoltenVK
PATCH_COMMAND git apply "${CMAKE_CURRENT_SOURCE_DIR}/patches.patch"
CONFIGURE_COMMAND "${CMAKE_CURRENT_SOURCE_DIR}/MoltenVK/fetchDependencies" --macos
BUILD_COMMAND xcodebuild build -quiet -project "${CMAKE_CURRENT_SOURCE_DIR}/MoltenVK/MoltenVKPackaging.xcodeproj" -scheme "MoltenVK Package \(macOS only\)" -configuration "Release" -arch "x86_64"
BUILD_COMMAND xcodebuild build -quiet -project "${CMAKE_CURRENT_SOURCE_DIR}/MoltenVK/MoltenVKPackaging.xcodeproj" -scheme "MoltenVK Package \(macOS only\)" -configuration "Release" -arch "${CMAKE_HOST_SYSTEM_PROCESSOR}"
COMMAND ln -f "${CMAKE_CURRENT_SOURCE_DIR}/MoltenVK/MoltenVK/dylib/macOS/libMoltenVK.dylib" "${CMAKE_CURRENT_SOURCE_DIR}/MoltenVK/Build/Products/Release/dynamic/libMoltenVK.dylib"
INSTALL_COMMAND ""
BUILD_BYPRODUCTS "${CMAKE_CURRENT_SOURCE_DIR}/MoltenVK/Build/Products/Release/dynamic/libMoltenVK.dylib"


@ -196,7 +196,11 @@ void* jit_runtime_base::_add(asmjit::CodeHolder* code) noexcept
ensure(!code->relocateToBase(uptr(p)));
{
// We manage rw <-> rx transitions manually on Apple
// because it's easier to keep track of when and where we need to toggle W^X
#if !(defined(ARCH_ARM64) && defined(__APPLE__))
asmjit::VirtMem::ProtectJitReadWriteScope rwScope(p, codeSize);
#endif
for (asmjit::Section* section : code->_sections)
{
@ -248,6 +252,9 @@ void jit_runtime::initialize()
void jit_runtime::finalize() noexcept
{
#ifdef __APPLE__
pthread_jit_write_protect_np(false);
#endif
// Reset JIT memory
#ifdef CAN_OVERCOMMIT
utils::memory_reset(get_jit_memory(), 0x80000000);
@ -262,6 +269,15 @@ void jit_runtime::finalize() noexcept
// Restore code/data snapshot
std::memcpy(alloc(s_code_init.size(), 1, true), s_code_init.data(), s_code_init.size());
std::memcpy(alloc(s_data_init.size(), 1, false), s_data_init.data(), s_data_init.size());
#ifdef __APPLE__
pthread_jit_write_protect_np(true);
#endif
#ifdef ARCH_ARM64
// Flush all cache lines after potentially writing executable code
asm("ISB");
asm("DSB ISH");
#endif
}
jit_runtime_base& asmjit::get_global_runtime()
@ -432,6 +448,21 @@ static u64 make_null_function(const std::string& name)
c.db(ch);
c.db(0);
c.align(AlignMode::kData, 16);
#else
// AArch64 implementation
Label jmp_address = c.newLabel();
Label data = c.newLabel();
// Force absolute jump to prevent out of bounds PC-rel jmp
c.ldr(args[0], arm::ptr(jmp_address));
c.br(args[0]);
c.align(AlignMode::kCode, 16);
c.bind(data);
c.embed(name.c_str(), name.size());
c.embedUInt8(0U);
c.bind(jmp_address);
c.embedUInt64(reinterpret_cast<u64>(&null));
c.align(AlignMode::kData, 16);
#endif
});
@ -840,6 +871,7 @@ jit_compiler::jit_compiler(const std::unordered_map<std::string, u64>& _link, co
std::string result;
auto null_mod = std::make_unique<llvm::Module> ("null_", *m_context);
null_mod->setTargetTriple(utils::c_llvm_default_triple);
if (_link.empty())
{
@ -852,7 +884,7 @@ jit_compiler::jit_compiler(const std::unordered_map<std::string, u64>& _link, co
else
{
mem = std::make_unique<MemoryManager2>();
null_mod->setTargetTriple(llvm::Triple::normalize("x86_64-unknown-linux-gnu"));
null_mod->setTargetTriple(utils::c_llvm_default_triple);
}
// Auxiliary JIT (does not use custom memory manager, only writes the objects)


@ -269,6 +269,9 @@ namespace asmjit
template <typename FT, typename Asm = native_asm, typename F>
inline FT build_function_asm(std::string_view name, F&& builder)
{
#ifdef __APPLE__
pthread_jit_write_protect_np(false);
#endif
using namespace asmjit;
auto& rt = get_global_runtime();

llvm

@ -1 +1 @@
Subproject commit c725f494c91611018f5d830eca22c0a1662c0f31
Subproject commit 5521155be5c869b0b760e1dec86c41cdbb7a75c0


@ -138,6 +138,11 @@ find_program(MACDEPLOYQT_EXECUTABLE macdeployqt HINTS "${_qt_bin_dir}")
# Copy icons to executable directory
if(APPLE)
if (CMAKE_BUILD_TYPE MATCHES "Debug" OR CMAKE_BUILD_TYPE MATCHES "RelWithDebInfo")
set(QT_DEPLOY_FLAGS "-no-strip")
else()
set(QT_DEPLOY_FLAGS "")
endif()
add_custom_command(TARGET rpcs3 POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
${RPCS3_SRC_DIR}/rpcs3.icns $<TARGET_FILE_DIR:rpcs3>/../Resources/rpcs3.icns
@ -147,7 +152,7 @@ if(APPLE)
${CMAKE_SOURCE_DIR}/bin/GuiConfigs $<TARGET_FILE_DIR:rpcs3>/../Resources/GuiConfigs
COMMAND ${CMAKE_COMMAND} -E copy_directory
${CMAKE_SOURCE_DIR}/bin/git $<TARGET_FILE_DIR:rpcs3>/../Resources/git
COMMAND "${MACDEPLOYQT_EXECUTABLE}" "${PROJECT_BINARY_DIR}/bin/rpcs3.app")
COMMAND "${MACDEPLOYQT_EXECUTABLE}" "${PROJECT_BINARY_DIR}/bin/rpcs3.app" "${QT_DEPLOY_FLAGS}")
elseif(UNIX)
add_custom_command(TARGET rpcs3 POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_directory


@ -21,6 +21,8 @@
#include "llvm/Target/TargetMachine.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#ifdef _MSC_VER
#pragma warning(pop)
#else
@ -2894,8 +2896,12 @@ protected:
bool m_is_be;
// Allow PSHUFB intrinsic
#ifdef ARCH_X64
bool m_use_ssse3 = true;
#else
// TODO: fix the pshufb arm64 native impl using TBL instruction
bool m_use_ssse3 = false;
#endif
// Allow FMA
bool m_use_fma = false;
@ -3640,25 +3646,41 @@ public:
template <typename T, typename = std::enable_if_t<std::is_same_v<llvm_common_t<T>, f32[4]>>>
static auto fre(T&& a)
{
#if defined(ARCH_X64)
return llvm_calli<f32[4], T>{"llvm.x86.sse.rcp.ps", {std::forward<T>(a)}};
#elif defined(ARCH_ARM64)
return llvm_calli<f32[4], T>{"llvm.aarch64.neon.frecpe.v4f32", {std::forward<T>(a)}};
#endif
}
template <typename T, typename = std::enable_if_t<std::is_same_v<llvm_common_t<T>, f32[4]>>>
static auto frsqe(T&& a)
{
#if defined(ARCH_X64)
return llvm_calli<f32[4], T>{"llvm.x86.sse.rsqrt.ps", {std::forward<T>(a)}};
#elif defined(ARCH_ARM64)
return llvm_calli<f32[4], T>{"llvm.aarch64.neon.frsqrte.v4f32", {std::forward<T>(a)}};
#endif
}
template <typename T, typename U, typename = std::enable_if_t<std::is_same_v<llvm_common_t<T, U>, f32[4]>>>
static auto fmax(T&& a, U&& b)
{
#if defined(ARCH_X64)
return llvm_calli<f32[4], T, U>{"llvm.x86.sse.max.ps", {std::forward<T>(a), std::forward<U>(b)}};
#elif defined(ARCH_ARM64)
return llvm_calli<f32[4], T, U>{"llvm.aarch64.neon.fmax.v4f32", {std::forward<T>(a), std::forward<U>(b)}};
#endif
}
template <typename T, typename U, typename = std::enable_if_t<std::is_same_v<llvm_common_t<T, U>, f32[4]>>>
static auto fmin(T&& a, U&& b)
{
#if defined(ARCH_X64)
return llvm_calli<f32[4], T, U>{"llvm.x86.sse.min.ps", {std::forward<T>(a), std::forward<U>(b)}};
#elif defined(ARCH_ARM64)
return llvm_calli<f32[4], T, U>{"llvm.aarch64.neon.fmin.v4f32", {std::forward<T>(a), std::forward<U>(b)}};
#endif
}
template <typename T, typename U, typename = std::enable_if_t<std::is_same_v<llvm_common_t<T, U>, u8[16]>>>


@ -65,6 +65,10 @@
#include "util/simd.hpp"
#include "util/sysinfo.hpp"
#ifdef __APPLE__
#include <libkern/OSCacheControl.h>
#endif
extern atomic_t<u64> g_watchdog_hold_ctr;
// Should be of the same type
@ -247,7 +251,104 @@ const auto ppu_gateway = build_function_asm<void(*)(ppu_thread*)>("ppu_gateway",
c.ret();
#else
// See https://github.com/ghc/ghc/blob/master/rts/include/stg/MachRegs.h
// for GHC calling convention definitions on Aarch64
// and https://developer.arm.com/documentation/den0024/a/The-ABI-for-ARM-64-bit-Architecture/Register-use-in-the-AArch64-Procedure-Call-Standard/Parameters-in-general-purpose-registers
// for AArch64 calling convention
// Push callee saved registers to the stack
// We need to save x18-x30 = 13 x 8B each + 8 bytes for 16B alignment = 112B
c.sub(a64::sp, a64::sp, Imm(112));
c.stp(a64::x18, a64::x19, arm::Mem(a64::sp));
c.stp(a64::x20, a64::x21, arm::Mem(a64::sp, 16));
c.stp(a64::x22, a64::x23, arm::Mem(a64::sp, 32));
c.stp(a64::x24, a64::x25, arm::Mem(a64::sp, 48));
c.stp(a64::x26, a64::x27, arm::Mem(a64::sp, 64));
c.stp(a64::x28, a64::x29, arm::Mem(a64::sp, 80));
c.str(a64::x30, arm::Mem(a64::sp, 96));
// Save sp for native longjmp emulation
Label native_sp_offset = c.newLabel();
c.ldr(a64::x10, arm::Mem(native_sp_offset));
c.str(a64::sp, arm::Mem(args[0], a64::x10));
// Load REG_Base - use absolute jump target to bypass rel jmp range limits
Label exec_addr = c.newLabel();
c.ldr(a64::x19, arm::Mem(exec_addr));
c.ldr(a64::x19, arm::Mem(a64::x19));
// Load PPUThread struct base -> REG_Sp
const arm::GpX ppu_t_base = a64::x20;
c.mov(ppu_t_base, args[0]);
// Load PC
const arm::GpX pc = a64::x26;
Label cia_offset = c.newLabel();
const arm::GpX cia_addr_reg = a64::x11;
// Load offset value
c.ldr(cia_addr_reg, arm::Mem(cia_offset));
// Load cia
c.ldr(pc, arm::Mem(ppu_t_base, cia_addr_reg));
// Zero top 32 bits
c.mov(a64::w26, a64::w26);
// Multiply by 2 to index into ptr table
const arm::GpX index_shift = a64::x27;
c.mov(index_shift, Imm(2));
c.mul(pc, pc, index_shift);
// Load call target
const arm::GpX call_target = a64::x28;
c.ldr(call_target, arm::Mem(a64::x19, pc));
// Compute REG_Hp
const arm::GpX reg_hp = a64::x21;
c.mov(reg_hp, call_target);
c.lsr(reg_hp, reg_hp, 48);
c.lsl(reg_hp, reg_hp, 13);
// Zero top 16 bits of call target
c.lsl(call_target, call_target, Imm(16));
c.lsr(call_target, call_target, Imm(16));
// Load registers
Label base_addr = c.newLabel();
c.ldr(a64::x22, arm::Mem(base_addr));
c.ldr(a64::x22, arm::Mem(a64::x22));
Label gpr_addr_offset = c.newLabel();
const arm::GpX gpr_addr_reg = a64::x9;
c.ldr(gpr_addr_reg, arm::Mem(gpr_addr_offset));
c.add(gpr_addr_reg, gpr_addr_reg, ppu_t_base);
c.ldr(a64::x23, arm::Mem(gpr_addr_reg));
c.ldr(a64::x24, arm::Mem(gpr_addr_reg, 8));
c.ldr(a64::x25, arm::Mem(gpr_addr_reg, 16));
// Execute LLE call
c.blr(call_target);
// Restore stack ptr
c.ldr(a64::x10, arm::Mem(native_sp_offset));
c.ldr(a64::sp, arm::Mem(args[0], a64::x10));
// Restore registers from the stack
c.ldp(a64::x18, a64::x19, arm::Mem(a64::sp));
c.ldp(a64::x20, a64::x21, arm::Mem(a64::sp, 16));
c.ldp(a64::x22, a64::x23, arm::Mem(a64::sp, 32));
c.ldp(a64::x24, a64::x25, arm::Mem(a64::sp, 48));
c.ldp(a64::x26, a64::x27, arm::Mem(a64::sp, 64));
c.ldp(a64::x28, a64::x29, arm::Mem(a64::sp, 80));
c.ldr(a64::x30, arm::Mem(a64::sp, 96));
// Restore stack ptr
c.add(a64::sp, a64::sp, Imm(112));
// Return
c.ret(a64::x30);
c.bind(exec_addr);
c.embedUInt64(reinterpret_cast<u64>(&vm::g_exec_addr));
c.bind(base_addr);
c.embedUInt64(reinterpret_cast<u64>(&vm::g_base_addr));
c.bind(cia_offset);
c.embedUInt64(static_cast<u64>(::offset32(&ppu_thread::cia)));
c.bind(gpr_addr_offset);
c.embedUInt64(static_cast<u64>(::offset32(&ppu_thread::gpr)));
c.bind(native_sp_offset);
c.embedUInt64(static_cast<u64>(::offset32(&ppu_thread::saved_native_sp)));
#endif
});
@ -1252,6 +1353,9 @@ void ppu_thread::cpu_task()
}
case ppu_cmd::initialize:
{
#ifdef __APPLE__
pthread_jit_write_protect_np(false);
#endif
cmd_pop();
while (!g_fxo->get<rsx::thread>().is_inited && !is_stopped())
@ -1267,6 +1371,15 @@ void ppu_thread::cpu_task()
thread_ctrl::wait_on<atomic_wait::op_ne>(g_progr_ptotal, 0);
g_fxo->get<progress_dialog_workaround>().skip_the_progress_dialog = true;
#ifdef __APPLE__
pthread_jit_write_protect_np(true);
#endif
#ifdef ARCH_ARM64
// Flush all cache lines after potentially writing executable code
asm("ISB");
asm("DSB ISH");
#endif
break;
}
case ppu_cmd::sleep:
@ -1396,6 +1509,15 @@ ppu_thread::ppu_thread(const ppu_thread_params& param, std::string_view name, u3
{
call_history.data.resize(call_history_max_size);
}
#ifdef __APPLE__
pthread_jit_write_protect_np(true);
#endif
#ifdef ARCH_ARM64
// Flush all cache lines after potentially writing executable code
asm("ISB");
asm("DSB ISH");
#endif
}
ppu_thread::thread_name_t::operator std::string() const
@ -1974,6 +2096,8 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime
#endif
c.ret();
#else
// Unimplemented, should fail.
c.brk(Imm(0x42));
c.ret(a64::x30);
#endif
});
@ -2552,6 +2676,9 @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<lv2_
named_thread_group workers("SPRX Worker ", std::min<u32>(utils::get_thread_count(), ::size32(file_queue)), [&]
{
#ifdef __APPLE__
pthread_jit_write_protect_np(false);
#endif
// Set low priority
thread_ctrl::scoped_priority low_prio(-1);
@ -3226,6 +3353,9 @@ bool ppu_initialize(const ppu_module& info, bool check_only)
// Set low priority
thread_ctrl::scoped_priority low_prio(-1);
#ifdef __APPLE__
pthread_jit_write_protect_np(false);
#endif
for (u32 i = work_cv++; i < workload.size(); i = work_cv++, g_progr_pdone++)
{
if (Emu.IsStopped())
@ -3287,6 +3417,9 @@ bool ppu_initialize(const ppu_module& info, bool check_only)
}
// Jit can be null if the loop doesn't ever enter.
#ifdef __APPLE__
pthread_jit_write_protect_np(false);
#endif
if (jit && !jit_mod.init)
{
jit->fin();
@ -3345,7 +3478,12 @@ static void ppu_initialize2(jit_compiler& jit, const ppu_module& module_part, co
std::unique_ptr<Module> _module = std::make_unique<Module>(obj_name, jit.get_context());
// Initialize target
#if defined(__APPLE__) && defined(ARCH_ARM64)
// Force target linux on macOS arm64 to bypass some 64-bit address space linking issues
_module->setTargetTriple(utils::c_llvm_default_triple);
#else
_module->setTargetTriple(Triple::normalize(sys::getProcessTriple()));
#endif
_module->setDataLayout(jit.get_engine().getTargetMachine()->createDataLayout());
// Initialize translator


@ -197,7 +197,7 @@ Function* PPUTranslator::Translate(const ppu_function& info)
// Create tail call to the check function
m_ir->SetInsertPoint(vcheck);
Call(GetType<void>(), "__check", m_thread, GetAddr())->setTailCallKind(llvm::CallInst::TCK_Tail);
Call(GetType<void>(), "__check", m_thread, GetAddr());
m_ir->CreateRetVoid();
}
else
@ -1948,13 +1948,13 @@ void PPUTranslator::SC(ppu_opcode_t op)
if (index < 1024)
{
Call(GetType<void>(), fmt::format("%s", ppu_syscall_code(index)), m_thread)->setTailCallKind(llvm::CallInst::TCK_Tail);
Call(GetType<void>(), fmt::format("%s", ppu_syscall_code(index)), m_thread);
m_ir->CreateRetVoid();
return;
}
}
Call(GetType<void>(), op.lev ? "__lv1call" : "__syscall", m_thread, num)->setTailCallKind(llvm::CallInst::TCK_Tail);
Call(GetType<void>(), op.lev ? "__lv1call" : "__syscall", m_thread, num);
m_ir->CreateRetVoid();
}
@ -2506,7 +2506,7 @@ void PPUTranslator::LWARX(ppu_opcode_t op)
{
RegStore(Trunc(GetAddr()), m_cia);
FlushRegisters();
Call(GetType<void>(), "__resinterp", m_thread)->setTailCallKind(llvm::CallInst::TCK_Tail);
Call(GetType<void>(), "__resinterp", m_thread);
m_ir->CreateRetVoid();
return;
}
@ -2648,7 +2648,7 @@ void PPUTranslator::LDARX(ppu_opcode_t op)
{
RegStore(Trunc(GetAddr()), m_cia);
FlushRegisters();
Call(GetType<void>(), "__resinterp", m_thread)->setTailCallKind(llvm::CallInst::TCK_Tail);
Call(GetType<void>(), "__resinterp", m_thread);
m_ir->CreateRetVoid();
return;
}
@ -4246,7 +4246,11 @@ void PPUTranslator::FCTIW(ppu_opcode_t op)
const auto xormask = m_ir->CreateSExt(m_ir->CreateFCmpOGE(b, ConstantFP::get(GetType<f64>(), std::exp2l(31.))), GetType<s32>());
// fix result saturation (0x80000000 -> 0x7fffffff)
#if defined(ARCH_X64)
SetFpr(op.frd, m_ir->CreateXor(xormask, Call(GetType<s32>(), "llvm.x86.sse2.cvtsd2si", m_ir->CreateInsertElement(GetUndef<f64[2]>(), b, u64{0}))));
#elif defined(ARCH_ARM64)
SetFpr(op.frd, m_ir->CreateXor(xormask, Call(GetType<s32>(), "llvm.aarch64.neon.fcvtns.i32.f64", b)));
#endif
//SetFPSCR_FR(Call(GetType<bool>(), m_pure_attr, "__fctiw_get_fr", b));
//SetFPSCR_FI(Call(GetType<bool>(), m_pure_attr, "__fctiw_get_fi", b));
@ -4262,7 +4266,11 @@ void PPUTranslator::FCTIWZ(ppu_opcode_t op)
const auto xormask = m_ir->CreateSExt(m_ir->CreateFCmpOGE(b, ConstantFP::get(GetType<f64>(), std::exp2l(31.))), GetType<s32>());
// fix result saturation (0x80000000 -> 0x7fffffff)
#if defined(ARCH_X64)
SetFpr(op.frd, m_ir->CreateXor(xormask, Call(GetType<s32>(), "llvm.x86.sse2.cvttsd2si", m_ir->CreateInsertElement(GetUndef<f64[2]>(), b, u64{0}))));
#elif defined(ARCH_ARM64)
SetFpr(op.frd, m_ir->CreateXor(xormask, Call(GetType<s32>(), "llvm.aarch64.neon.fcvtzs.i32.f64", b)));
#endif
}
void PPUTranslator::FDIV(ppu_opcode_t op)
@ -4538,7 +4546,12 @@ void PPUTranslator::FCTID(ppu_opcode_t op)
const auto xormask = m_ir->CreateSExt(m_ir->CreateFCmpOGE(b, ConstantFP::get(GetType<f64>(), std::exp2l(63.))), GetType<s64>());
// fix result saturation (0x8000000000000000 -> 0x7fffffffffffffff)
#if defined(ARCH_X64)
SetFpr(op.frd, m_ir->CreateXor(xormask, Call(GetType<s64>(), "llvm.x86.sse2.cvtsd2si64", m_ir->CreateInsertElement(GetUndef<f64[2]>(), b, u64{0}))));
#elif defined(ARCH_ARM64)
SetFpr(op.frd, m_ir->CreateXor(xormask, Call(GetType<s64>(), "llvm.aarch64.neon.fcvtns.i64.f64", b)));
#endif
//SetFPSCR_FR(Call(GetType<bool>(), m_pure_attr, "__fctid_get_fr", b));
//SetFPSCR_FI(Call(GetType<bool>(), m_pure_attr, "__fctid_get_fi", b));
@ -4554,7 +4567,11 @@ void PPUTranslator::FCTIDZ(ppu_opcode_t op)
const auto xormask = m_ir->CreateSExt(m_ir->CreateFCmpOGE(b, ConstantFP::get(GetType<f64>(), std::exp2l(63.))), GetType<s64>());
// fix result saturation (0x8000000000000000 -> 0x7fffffffffffffff)
#if defined(ARCH_X64)
SetFpr(op.frd, m_ir->CreateXor(xormask, Call(GetType<s64>(), "llvm.x86.sse2.cvttsd2si64", m_ir->CreateInsertElement(GetUndef<f64[2]>(), b, u64{0}))));
#elif defined(ARCH_ARM64)
SetFpr(op.frd, m_ir->CreateXor(xormask, Call(GetType<s64>(), "llvm.aarch64.neon.fcvtzs.i64.f64", b)));
#endif
}
void PPUTranslator::FCFID(ppu_opcode_t op)
@ -4571,7 +4588,7 @@ void PPUTranslator::FCFID(ppu_opcode_t op)
void PPUTranslator::UNK(ppu_opcode_t op)
{
FlushRegisters();
Call(GetType<void>(), "__error", m_thread, GetAddr(), m_ir->getInt32(op.opcode))->setTailCallKind(llvm::CallInst::TCK_Tail);
Call(GetType<void>(), "__error", m_thread, GetAddr(), m_ir->getInt32(op.opcode));
m_ir->CreateRetVoid();
}
@ -4832,7 +4849,7 @@ Value* PPUTranslator::CheckTrapCondition(u32 to, Value* left, Value* right)
void PPUTranslator::Trap()
{
Call(GetType<void>(), "__trap", m_thread, GetAddr())->setTailCallKind(llvm::CallInst::TCK_Tail);
Call(GetType<void>(), "__trap", m_thread, GetAddr());
m_ir->CreateRetVoid();
}


@ -34,6 +34,15 @@ const extern spu_decoder<spu_iflag> g_spu_iflag;
// Move 4 args for calling native function from a GHC calling convention function
static u8* move_args_ghc_to_native(u8* raw)
{
#ifdef ARCH_ARM64
// Note: this is a placeholder to get rpcs3 working for now
// mov x0, x22
// mov x1, x23
// mov x2, x24
// mov x3, x25
std::memcpy(raw, "\xE0\x03\x16\xAA\xE1\x03\x17\xAA\xE2\x03\x18\xAA\xE3\x03\x19\xAA", 16);
return raw + 16;
#else
#ifdef _WIN32
// mov rcx, r13
// mov rdx, rbp
@ -49,10 +58,14 @@ static u8* move_args_ghc_to_native(u8* raw)
#endif
return raw + 12;
#endif
}
DECLARE(spu_runtime::tr_dispatch) = []
{
#ifdef __APPLE__
pthread_jit_write_protect_np(false);
#endif
// Generate a special trampoline to spu_recompiler_base::dispatch with pause instruction
u8* const trptr = jit_runtime::alloc(32, 16);
u8* raw = move_args_ghc_to_native(trptr);
@ -439,6 +452,9 @@ void spu_cache::initialize()
named_thread_group workers("SPU Worker ", worker_count, [&]() -> uint
{
#ifdef __APPLE__
pthread_jit_write_protect_np(false);
#endif
// Set low priority
thread_ctrl::scoped_priority low_prio(-1);
@ -4412,7 +4428,7 @@ public:
// Create LLVM module
std::unique_ptr<Module> _module = std::make_unique<Module>(m_hash + ".obj", m_context);
_module->setTargetTriple(Triple::normalize("x86_64-unknown-linux-gnu"));
_module->setTargetTriple(utils::c_llvm_default_triple);
_module->setDataLayout(m_jit.get_engine().getTargetMachine()->createDataLayout());
m_module = _module.get();
@ -4672,7 +4688,12 @@ public:
// Function that executes check_state and escapes if necessary
m_test_state = llvm::cast<llvm::Function>(m_module->getOrInsertFunction("spu_test_state", get_ftype<void, u8*>()).getCallee());
m_test_state->setLinkage(GlobalValue::InternalLinkage);
#ifdef ARCH_ARM64
// LLVM doesn't support PreserveAll on arm64.
m_test_state->setCallingConv(CallingConv::GHC);
#else
m_test_state->setCallingConv(CallingConv::PreserveAll);
#endif
m_ir->SetInsertPoint(BasicBlock::Create(m_context, "", m_test_state));
const auto escape_yes = BasicBlock::Create(m_context, "", m_test_state);
const auto escape_no = BasicBlock::Create(m_context, "", m_test_state);
@ -5069,7 +5090,7 @@ public:
// Create LLVM module
std::unique_ptr<Module> _module = std::make_unique<Module>("spu_interpreter.obj", m_context);
_module->setTargetTriple(Triple::normalize("x86_64-unknown-linux-gnu"));
_module->setTargetTriple(utils::c_llvm_default_triple);
_module->setDataLayout(m_jit.get_engine().getTargetMachine()->createDataLayout());
m_module = _module.get();
@ -5114,7 +5135,11 @@ public:
// Save host thread's stack pointer
const auto native_sp = spu_ptr<u64>(&spu_thread::saved_native_sp);
#if defined(ARCH_X64)
const auto rsp_name = MetadataAsValue::get(m_context, MDNode::get(m_context, {MDString::get(m_context, "rsp")}));
#elif defined(ARCH_ARM64)
const auto rsp_name = MetadataAsValue::get(m_context, MDNode::get(m_context, {MDString::get(m_context, "sp")}));
#endif
m_ir->CreateStore(m_ir->CreateCall(get_intrinsic<u64>(Intrinsic::read_register), {rsp_name}), native_sp);
// Decode (shift) and load function pointer
@ -5328,7 +5353,11 @@ public:
else if (!(itype & spu_itype::branch))
{
// Hack: inline ret instruction before final jmp; this is not reliable.
#ifdef ARCH_X64
m_ir->CreateCall(InlineAsm::get(get_ftype<void>(), "ret", "", true, false, InlineAsm::AD_Intel));
#else
m_ir->CreateCall(InlineAsm::get(get_ftype<void>(), "ret", "", true, false));
#endif
fret = ret_func;
}


@ -1414,6 +1414,9 @@ extern thread_local std::string(*g_tls_log_prefix)();
void spu_thread::cpu_task()
{
#ifdef __APPLE__
pthread_jit_write_protect_np(true);
#endif
// Get next PC and SPU Interrupt status
pc = status_npc.load().npc;


@ -1149,8 +1149,16 @@ extern void ppu_execute_syscall(ppu_thread& ppu, u64 code)
if (const auto func = g_ppu_syscall_table[code].first)
{
#ifdef __APPLE__
pthread_jit_write_protect_np(false);
#endif
func(ppu, {}, vm::_ptr<u32>(ppu.cia), nullptr);
ppu_log.trace("Syscall '%s' (%llu) finished, r3=0x%llx", ppu_syscall_code(code), code, ppu.gpr[3]);
#ifdef __APPLE__
pthread_jit_write_protect_np(true);
// No need to flush cache lines after a syscall, since we didn't generate any code.
#endif
return;
}
}


@ -270,9 +270,13 @@ namespace
#endif
}
#if !defined(__APPLE__) || defined(ARCH_X64)
DECLARE(copy_data_swap_u32) = build_function_asm<void(*)(u32*, const u32*, u32)>("copy_data_swap_u32", &build_copy_data_swap_u32<false>);
DECLARE(copy_data_swap_u32_cmp) = build_function_asm<bool(*)(u32*, const u32*, u32)>("copy_data_swap_u32_cmp", &build_copy_data_swap_u32<true>);
#else
DECLARE(copy_data_swap_u32) = copy_data_swap_u32_naive<false>;
DECLARE(copy_data_swap_u32_cmp) = copy_data_swap_u32_naive<true>;
#endif
namespace
{


@ -67,4 +67,16 @@ namespace utils
u32 get_rep_movsb_threshold();
extern const u64 main_tid;
#ifdef LLVM_AVAILABLE
#if defined(ARCH_X64)
const std::string c_llvm_default_triple = "x86_64-unknown-linux-gnu";
#elif defined(ARCH_ARM64)
const std::string c_llvm_default_triple = "arm64-unknown-linux-gnu";
#else
const std::string c_llvm_default_triple = "Unimplemented!"
#endif
#endif
}


@ -260,7 +260,11 @@ namespace utils
size += 0x10000;
}
#ifdef __APPLE__
auto ptr = ::mmap(use_addr, size, PROT_NONE, MAP_ANON | MAP_PRIVATE | MAP_JIT | c_map_noreserve, -1, 0);
#else
auto ptr = ::mmap(use_addr, size, PROT_NONE, MAP_ANON | MAP_PRIVATE | c_map_noreserve, -1, 0);
#endif
if (ptr == reinterpret_cast<void*>(uptr{umax}))
{
@ -333,7 +337,16 @@ namespace utils
ensure(::VirtualFree(pointer, size, MEM_DECOMMIT));
#else
const u64 ptr64 = reinterpret_cast<u64>(pointer);
#if defined(__APPLE__) && defined(ARCH_ARM64)
// Hack: on macOS, Apple explicitly fails mmap if you combine MAP_FIXED and MAP_JIT.
// So we unmap the space and just hope it maps to the same address we got before instead.
// The Xcode manpage says the pointer is a hint and the OS will try to map at the hint location
// so this isn't completely undefined behavior.
ensure(::munmap(pointer, size) != -1);
ensure(::mmap(pointer, size, PROT_NONE, MAP_ANON | MAP_PRIVATE | MAP_JIT, -1, 0) == pointer);
#else
ensure(::mmap(pointer, size, PROT_NONE, MAP_FIXED | MAP_ANON | MAP_PRIVATE | c_map_noreserve, -1, 0) != reinterpret_cast<void*>(uptr{umax}));
#endif
if constexpr (c_madv_no_dump != 0)
{
@ -353,7 +366,12 @@ namespace utils
memory_commit(pointer, size, prot);
#else
const u64 ptr64 = reinterpret_cast<u64>(pointer);
#if defined(__APPLE__) && defined(ARCH_ARM64)
ensure(::munmap(pointer, size) != -1);
ensure(::mmap(pointer, size, +prot, MAP_ANON | MAP_PRIVATE | MAP_JIT, -1, 0) == pointer);
#else
ensure(::mmap(pointer, size, +prot, MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0) != reinterpret_cast<void*>(uptr{umax}));
#endif
if constexpr (c_madv_hugepage != 0)
{