diff --git a/Utilities/JIT.cpp b/Utilities/JIT.cpp index a58c29a8bb..6413053a0e 100644 --- a/Utilities/JIT.cpp +++ b/Utilities/JIT.cpp @@ -763,11 +763,12 @@ std::string jit_compiler::cpu(const std::string& _cpu) } jit_compiler::jit_compiler(const std::unordered_map& _link, const std::string& _cpu, u32 flags) - : m_cpu(cpu(_cpu)) + : m_context(new llvm::LLVMContext) + , m_cpu(cpu(_cpu)) { std::string result; - auto null_mod = std::make_unique ("null_", m_context); + auto null_mod = std::make_unique ("null_", *m_context); if (_link.empty()) { diff --git a/Utilities/JIT.h b/Utilities/JIT.h index e90b039a67..85454c07e1 100644 --- a/Utilities/JIT.h +++ b/Utilities/JIT.h @@ -1,5 +1,7 @@ #pragma once +#include "util/types.hpp" + // Include asmjit with warnings ignored #define ASMJIT_EMBED #define ASMJIT_DEBUG @@ -27,6 +29,10 @@ #include #include +#include +#include +#include +#include enum class jit_class { @@ -251,43 +257,18 @@ public: #ifdef LLVM_AVAILABLE -#include -#include -#include -#include - -#include "util/types.hpp" - -#ifdef _MSC_VER -#pragma warning(push, 0) -#else -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wall" -#pragma GCC diagnostic ignored "-Wextra" -#pragma GCC diagnostic ignored "-Wold-style-cast" -#pragma GCC diagnostic ignored "-Wsuggest-override" -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Weffc++" -#pragma GCC diagnostic ignored "-Wmissing-noreturn" -#ifdef __clang__ -#pragma clang diagnostic ignored "-Winconsistent-missing-override" -#endif -#endif -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" -#include "llvm/ExecutionEngine/ExecutionEngine.h" -#ifdef _MSC_VER -#pragma warning(pop) -#else -#pragma GCC diagnostic pop -#endif +namespace llvm +{ + class LLVMContext; + class ExecutionEngine; + class Module; +} // Temporary compiler interface class jit_compiler final { // Local LLVM context - llvm::LLVMContext m_context{}; + std::unique_ptr m_context{}; // Execution instance std::unique_ptr m_engine{}; @@ -302,7 +283,7 @@ public: // Get LLVM context auto& get_context() { - return m_context; + return *m_context; } auto& get_engine() const diff --git a/rpcs3/Emu/CPU/CPUTranslator.h b/rpcs3/Emu/CPU/CPUTranslator.h index b834fe8aed..9cd3d8cb30 100644 --- a/rpcs3/Emu/CPU/CPUTranslator.h +++ b/rpcs3/Emu/CPU/CPUTranslator.h @@ -15,6 +15,7 @@ #pragma GCC diagnostic ignored "-Wmissing-noreturn" #endif #include "llvm/IR/LLVMContext.h" +#include "llvm/ExecutionEngine/ExecutionEngine.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Module.h" #include "llvm/Target/TargetMachine.h" diff --git a/rpcs3/Emu/RSX/Common/BufferUtils.cpp b/rpcs3/Emu/RSX/Common/BufferUtils.cpp index 3cdabd12ea..8e2f2e89f4 100644 --- a/rpcs3/Emu/RSX/Common/BufferUtils.cpp +++ b/rpcs3/Emu/RSX/Common/BufferUtils.cpp @@ -18,13 +18,22 @@ #endif #if defined(_MSC_VER) +#define PLAIN_FUNC #define SSSE3_FUNC #define SSE4_1_FUNC #define AVX2_FUNC +#define AVX3_FUNC #else +#ifndef __clang__ +#define PLAIN_FUNC __attribute__((optimize("no-tree-vectorize"))) +#define SSSE3_FUNC __attribute__((__target__("ssse3"))) __attribute__((optimize("tree-vectorize"))) +#else +#define PLAIN_FUNC #define SSSE3_FUNC __attribute__((__target__("ssse3"))) +#endif #define SSE4_1_FUNC __attribute__((__target__("sse4.1"))) #define AVX2_FUNC __attribute__((__target__("avx2"))) +#define AVX3_FUNC __attribute__((__target__("avx512f,avx512bw,avx512dq,avx512cd,avx512vl"))) #ifndef __AVX2__ using __m256i = long long __attribute__((vector_size(32))); #endif @@ -45,22 +54,31 @@ SSE4_1_FUNC static inline u16 sse41_hmax_epu16(__m128i x) return ~_mm_cvtsi128_si32(_mm_minpos_epu16(_mm_xor_si128(x, _mm_set1_epi32(-1)))); } -#if defined(__AVX2__) +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512CD__) && defined(__AVX512BW__) constexpr bool s_use_ssse3 = true; constexpr bool s_use_sse4_1 = true; constexpr bool s_use_avx2 = true; +constexpr bool s_use_avx3 = true; +#elif defined(__AVX2__) +constexpr bool s_use_ssse3 = true; +constexpr bool s_use_sse4_1 = true; +constexpr bool s_use_avx2 = true; +constexpr bool s_use_avx3 = false; #elif defined(__SSE41__) constexpr bool s_use_ssse3 = true; constexpr bool s_use_sse4_1 = true; constexpr bool s_use_avx2 = false; +constexpr bool s_use_avx3 = false; #elif defined(__SSSE3__) constexpr bool s_use_ssse3 = true; constexpr bool s_use_sse4_1 = false; constexpr bool s_use_avx2 = false; +constexpr bool s_use_avx3 = false; #else const bool s_use_ssse3 = utils::has_ssse3(); const bool s_use_sse4_1 = utils::has_sse41(); const bool s_use_avx2 = utils::has_avx2(); +const bool s_use_avx3 = utils::has_avx512(); #endif const __m128i s_bswap_u32_mask = _mm_set_epi8( @@ -102,226 +120,69 @@ namespace X = X << 5; return{ X, Y, Z, 1 }; } -} -template -AVX2_FUNC inline bool copy_data_swap_u32_avx2(void*& dst, const void*& src, u32 count) -{ - const __m256i bswap_u32_mask = _mm256_set_m128i(s_bswap_u32_mask, s_bswap_u32_mask); - - __m128i diff0 = _mm_setzero_si128(); - __m256i diff = _mm256_setzero_si256(); - - if (uptr(dst) & 16 && count >= 4) + template + PLAIN_FUNC bool copy_data_swap_u32_naive(u32* dst, const u32* src, u32 count) { - const auto dst0 = static_cast<__m128i*>(dst); - const auto src0 = static_cast(src); - const auto data = _mm_shuffle_epi8(_mm_loadu_si128(src0), s_bswap_u32_mask); - - if (Compare) - { - diff0 = _mm_xor_si128(data, _mm_load_si128(dst0)); - } - - _mm_store_si128(dst0, data); - dst = dst0 + 1; - src = src0 + 1; - count -= 4; - } - - const u32 lane_count = count / 8; - - auto dst_ptr = static_cast<__m256i*>(dst); - auto src_ptr = static_cast(src); + u32 result = 0; #ifdef __clang__ -#pragma clang loop unroll(disable) + #pragma clang loop vectorize(disable) interleave(disable) unroll(disable) #endif - for (u32 i = 0; i < lane_count; ++i) - { - const __m256i vec0 = _mm256_loadu_si256(src_ptr + i); - const __m256i vec1 = _mm256_shuffle_epi8(vec0, bswap_u32_mask); - - if constexpr (Compare) + for (u32 i = 0; i < count; i++) { - diff = _mm256_or_si256(diff, _mm256_xor_si256(vec1, _mm256_load_si256(dst_ptr + i))); - } - - _mm256_store_si256(dst_ptr + i, vec1); - } - - dst = dst_ptr + lane_count; - src = src_ptr + lane_count; - - if (count & 4) - { - const auto dst0 = static_cast<__m128i*>(dst); - const auto src0 = static_cast(src); - const auto data = _mm_shuffle_epi8(_mm_loadu_si128(src0), s_bswap_u32_mask); - - if (Compare) - { - diff0 = _mm_or_si128(diff0, _mm_xor_si128(data, _mm_load_si128(dst0))); - } - - _mm_store_si128(dst0, data); - dst = dst0 + 1; - src = src0 + 1; - } - - if constexpr (Compare) - { - diff = _mm256_or_si256(diff, _mm256_set_m128i(_mm_setzero_si128(), diff0)); - return !_mm256_testz_si256(diff, diff); - } - else - { - return false; - } -} - -template -static auto copy_data_swap_u32(void* dst, const void* src, u32 count) -{ - bool result = false; - - if (uptr(dst) & 4) - { - const auto dst0 = static_cast(dst); - const auto src0 = static_cast(src); - const u32 data = stx::se_storage::swap(*src0); - - if (Compare && *dst0 != data) - { - result = true; - } - - *dst0 = data; - dst = dst0 + 1; - src = src0 + 1; - count--; - } - - if (uptr(dst) & 8 && count >= 2) - { - const auto dst0 = static_cast(dst); - const auto src0 = static_cast(src); - const u64 data = utils::rol64(stx::se_storage::swap(*src0), 32); - - if (Compare && *dst0 != data) - { - result = true; - } - - *dst0 = data; - dst = dst0 + 1; - src = src0 + 1; - count -= 2; - } - - const u32 lane_count = count / 4; - - if (s_use_avx2) [[likely]] - { - result |= copy_data_swap_u32_avx2(dst, src, count); - } - else if (s_use_ssse3) - { - __m128i diff = _mm_setzero_si128(); - - auto dst_ptr = static_cast<__m128i*>(dst); - auto src_ptr = static_cast(src); - - for (u32 i = 0; i < lane_count; ++i) - { - const __m128i vec0 = _mm_loadu_si128(src_ptr + i); - const __m128i vec1 = ssse3_shuffle_epi8(vec0, s_bswap_u32_mask); + const u32 data = stx::se_storage::swap(src[i]); if constexpr (Compare) { - diff = _mm_or_si128(diff, _mm_xor_si128(vec1, _mm_load_si128(dst_ptr + i))); + result |= data ^ dst[i]; } - _mm_store_si128(dst_ptr + i, vec1); + dst[i] = data; } - result |= _mm_cvtsi128_si64(_mm_packs_epi32(diff, diff)) != 0; - - dst = dst_ptr + lane_count; - src = src_ptr + lane_count; + return static_cast(result); } - else + + template + SSSE3_FUNC bool copy_data_swap_u32_ssse3(u32* dst, const u32* src, u32 count) { - __m128i diff = _mm_setzero_si128(); + u32 result = 0; - auto dst_ptr = static_cast<__m128i*>(dst); - auto src_ptr = static_cast(src); - - for (u32 i = 0; i < lane_count; ++i) +#ifdef __clang__ + #pragma clang loop vectorize(enable) interleave(disable) unroll(disable) +#endif + for (u32 i = 0; i < count; i++) { - const __m128i vec0 = _mm_loadu_si128(src_ptr + i); - const __m128i vec1 = _mm_or_si128(_mm_slli_epi16(vec0, 8), _mm_srli_epi16(vec0, 8)); - const __m128i vec2 = _mm_or_si128(_mm_slli_epi32(vec1, 16), _mm_srli_epi32(vec1, 16)); + const u32 data = stx::se_storage::swap(src[i]); if constexpr (Compare) { - diff = _mm_or_si128(diff, _mm_xor_si128(vec2, _mm_load_si128(dst_ptr + i))); + result |= data ^ dst[i]; } - _mm_store_si128(dst_ptr + i, vec2); + dst[i] = data; } - result |= _mm_cvtsi128_si64(_mm_packs_epi32(diff, diff)) != 0; - - dst = dst_ptr + lane_count; - src = src_ptr + lane_count; + return static_cast(result); } - if (count & 2) + template + void build_copy_data_swap_u32(asmjit::X86Assembler& c, std::array& args) { - const auto dst0 = static_cast(dst); - const auto src0 = static_cast(src); - const u64 data = utils::rol64(stx::se_storage::swap(*src0), 32); - - if (Compare && *dst0 != data) + if (utils::has_ssse3()) { - result = true; + c.jmp(asmjit::imm_ptr(©_data_swap_u32_ssse3)); + return; } - *dst0 = data; - dst = dst0 + 1; - src = src0 + 1; - } - - if (count & 1) - { - const auto dst0 = static_cast(dst); - const auto src0 = static_cast(src); - const u32 data = stx::se_storage::swap(*src0); - - if (Compare && *dst0 != data) - { - result = true; - } - - *dst0 = data; - } - - if constexpr (Compare) - { - return result; + c.jmp(asmjit::imm_ptr(©_data_swap_u32_naive)); } } -bool copy_data_swap_u32_cmp(void* dst, const void* src, u32 count) -{ - return copy_data_swap_u32(dst, src, count); -} +built_function copy_data_swap_u32(&build_copy_data_swap_u32); -void copy_data_swap_u32(void* dst, const void* src, u32 count) -{ - copy_data_swap_u32(dst, src, count); -} +built_function copy_data_swap_u32_cmp(&build_copy_data_swap_u32); namespace { diff --git a/rpcs3/Emu/RSX/Common/BufferUtils.h b/rpcs3/Emu/RSX/Common/BufferUtils.h index 5cef7472cf..84684c33e4 100644 --- a/rpcs3/Emu/RSX/Common/BufferUtils.h +++ b/rpcs3/Emu/RSX/Common/BufferUtils.h @@ -1,6 +1,7 @@ #pragma once #include "../gcm_enums.h" +#include "Utilities/JIT.h" #include @@ -56,7 +57,7 @@ void stream_vector(void *dst, u32 x, u32 y, u32 z, u32 w); void stream_vector_from_memory(void *dst, void *src); // Copy and swap data in 32-bit units -void copy_data_swap_u32(void* dst, const void* src, u32 count); +extern built_function copy_data_swap_u32; // Copy and swap data in 32-bit units, return true if changed -bool copy_data_swap_u32_cmp(void* dst, const void* src, u32 count); +extern built_function copy_data_swap_u32_cmp;