mirror of
https://github.com/RPCS3/rpcs3.git
synced 2024-11-22 02:32:36 +01:00
types.hpp: remove intrinsic includes
Replace v128 with u128 in some places. Removed some unused files.
This commit is contained in:
parent
5f618814f6
commit
bd269bccaf
@ -243,6 +243,18 @@ void fmt_class_string<v128>::format(std::string& out, u64 arg)
|
||||
fmt::append(out, "0x%016llx%016llx", vec._u64[1], vec._u64[0]);
|
||||
}
|
||||
|
||||
template <>
|
||||
void fmt_class_string<u128>::format(std::string& out, u64 arg)
|
||||
{
|
||||
// TODO: it should be supported as full-fledged integral type (with %u, %d, etc, fmt)
|
||||
const u128& num = get_object(arg);
|
||||
#ifdef _MSC_VER
|
||||
fmt::append(out, "0x%016llx%016llx", num.hi, num.lo);
|
||||
#else
|
||||
fmt::append(out, "0x%016llx%016llx", static_cast<u64>(num >> 64), static_cast<u64>(num));
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
void fmt_class_string<src_loc>::format(std::string& out, u64 arg)
|
||||
{
|
||||
|
@ -76,6 +76,8 @@
|
||||
#include "util/vm.hpp"
|
||||
#include "util/logs.hpp"
|
||||
#include "util/asm.hpp"
|
||||
#include "util/v128.hpp"
|
||||
#include "util/v128sse.hpp"
|
||||
#include "util/sysinfo.hpp"
|
||||
#include "Emu/Memory/vm_locking.h"
|
||||
|
||||
|
1043
Utilities/typemap.h
1043
Utilities/typemap.h
File diff suppressed because it is too large
Load Diff
@ -5,7 +5,6 @@
|
||||
#include "Utilities/mutex.h"
|
||||
#include <cmath>
|
||||
|
||||
#include "util/v128.hpp"
|
||||
#include "util/asm.hpp"
|
||||
|
||||
LOG_CHANNEL(edat_log, "EDAT");
|
||||
@ -138,15 +137,15 @@ std::tuple<u64, s32, s32> dec_section(unsigned char* metadata)
|
||||
return std::make_tuple(offset, length, compression_end);
|
||||
}
|
||||
|
||||
// Builds the 16-byte key for one data block: the first 0xC bytes are copied from
// npd->dev_hash (or from an all-zero buffer when npd->version <= 1) and the last
// 4 bytes hold the result of swap32(block).
// NOTE(review): this span is a rendered diff hunk. Each changed statement appears
// twice: first the pre-change v128 form, then its u128 replacement.
v128 get_block_key(int block, NPD_HEADER *npd)
|
||||
u128 get_block_key(int block, NPD_HEADER *npd)
|
||||
{
|
||||
unsigned char empty_key[0x10] = {};
|
||||
unsigned char *src_key = (npd->version <= 1) ? empty_key : npd->dev_hash;
|
||||
v128 dest_key{};
|
||||
memcpy(dest_key._bytes, src_key, 0xC);
|
||||
u128 dest_key{};
|
||||
std::memcpy(&dest_key, src_key, 0xC);
|
||||
|
||||
// The block index fills bytes 0xC..0xF of the key after passing through swap32().
s32 swappedBlock = swap32(block);
|
||||
memcpy(&dest_key._bytes[0xC], &swappedBlock, sizeof(swappedBlock));
|
||||
std::memcpy(reinterpret_cast<uchar*>(&dest_key) + 0xC, &swappedBlock, sizeof(swappedBlock));
|
||||
return dest_key;
|
||||
}
|
||||
|
||||
@ -251,7 +250,7 @@ s64 decrypt_block(const fs::file* in, u8* out, EDAT_HEADER *edat, NPD_HEADER *np
|
||||
auto b_key = get_block_key(block_num, npd);
|
||||
|
||||
// Encrypt the block key with the crypto key.
|
||||
aesecb128_encrypt(crypt_key, b_key._bytes, key_result);
|
||||
aesecb128_encrypt(crypt_key, reinterpret_cast<uchar*>(&b_key), key_result);
|
||||
if ((edat->flags & EDAT_FLAG_0x10) != 0)
|
||||
aesecb128_encrypt(crypt_key, key_result, hash); // If FLAG 0x10 is set, encrypt again to get the final hash.
|
||||
else
|
||||
@ -556,9 +555,10 @@ int validate_dev_klic(const u8* klicensee, NPD_HEADER *npd)
|
||||
memcpy(dev + 0xC, &type, 4);
|
||||
|
||||
// Check for an empty dev_hash (can't validate if devklic is NULL);
|
||||
auto klic = v128::loadu(klicensee);
|
||||
u128 klic;
|
||||
std::memcpy(&klic, klicensee, sizeof(klic));
|
||||
|
||||
if (klic == v128{})
|
||||
if (!klic)
|
||||
{
|
||||
// Allow empty dev hash.
|
||||
return 1;
|
||||
@ -566,10 +566,10 @@ int validate_dev_klic(const u8* klicensee, NPD_HEADER *npd)
|
||||
else
|
||||
{
|
||||
// Generate klicensee xor key.
|
||||
auto key = klic ^ std::bit_cast<v128>(NP_OMAC_KEY_2);
|
||||
u128 key = klic ^ std::bit_cast<u128>(NP_OMAC_KEY_2);
|
||||
|
||||
// Hash with generated key and compare with dev_hash.
|
||||
return cmac_hash_compare(key._bytes, 0x10, dev, 0x60, npd->dev_hash, 0x10);
|
||||
return cmac_hash_compare(reinterpret_cast<uchar*>(&key), 0x10, dev, 0x60, npd->dev_hash, 0x10);
|
||||
}
|
||||
}
|
||||
|
||||
@ -668,7 +668,7 @@ bool extract_all_data(const fs::file* input, const fs::file* output, const char*
|
||||
}
|
||||
|
||||
// Set decryption key.
|
||||
v128 key{};
|
||||
u128 key{};
|
||||
|
||||
// Check EDAT/SDAT flag.
|
||||
if ((EDAT.flags & SDAT_FLAG) == SDAT_FLAG)
|
||||
@ -682,7 +682,7 @@ bool extract_all_data(const fs::file* input, const fs::file* output, const char*
|
||||
}
|
||||
|
||||
// Generate SDAT key.
|
||||
key = std::bit_cast<v128>(NPD.dev_hash) ^ std::bit_cast<v128>(SDAT_KEY);
|
||||
key = std::bit_cast<u128>(NPD.dev_hash) ^ std::bit_cast<u128>(SDAT_KEY);
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -715,7 +715,7 @@ bool extract_all_data(const fs::file* input, const fs::file* output, const char*
|
||||
memcpy(&key, rifkey, 0x10);
|
||||
|
||||
// Make sure we don't have an empty RIF key.
|
||||
if (key == v128{})
|
||||
if (!key)
|
||||
{
|
||||
edat_log.error("EDAT: A valid RAP file is needed for this EDAT file! (local activation)");
|
||||
return 1;
|
||||
@ -726,7 +726,7 @@ bool extract_all_data(const fs::file* input, const fs::file* output, const char*
|
||||
memcpy(&key, rifkey, 0x10);
|
||||
|
||||
// Make sure we don't have an empty RIF key.
|
||||
if (key == v128{})
|
||||
if (!key)
|
||||
{
|
||||
edat_log.error("EDAT: A valid RAP file is needed for this EDAT file! (network activation)");
|
||||
return 1;
|
||||
@ -735,7 +735,7 @@ bool extract_all_data(const fs::file* input, const fs::file* output, const char*
|
||||
|
||||
if (verbose)
|
||||
{
|
||||
be_t<v128> data;
|
||||
be_t<u128> data;
|
||||
|
||||
std::memcpy(&data, devklic, sizeof(data));
|
||||
edat_log.notice("DEVKLIC: %s", data);
|
||||
@ -746,18 +746,18 @@ bool extract_all_data(const fs::file* input, const fs::file* output, const char*
|
||||
|
||||
if (verbose)
|
||||
{
|
||||
edat_log.notice("DECRYPTION KEY: %s", std::bit_cast<be_t<v128>>(key));
|
||||
edat_log.notice("DECRYPTION KEY: %s", std::bit_cast<be_t<u128>>(key));
|
||||
}
|
||||
|
||||
input->seek(0);
|
||||
if (check_data(key._bytes, &EDAT, &NPD, input, verbose))
|
||||
if (check_data(reinterpret_cast<uchar*>(&key), &EDAT, &NPD, input, verbose))
|
||||
{
|
||||
edat_log.error("EDAT: Data parsing failed!");
|
||||
return 1;
|
||||
}
|
||||
|
||||
input->seek(0);
|
||||
if (decrypt_data(input, output, &EDAT, &NPD, key._bytes, verbose))
|
||||
if (decrypt_data(input, output, &EDAT, &NPD, reinterpret_cast<uchar*>(&key), verbose))
|
||||
{
|
||||
edat_log.error("EDAT: Data decryption failed!");
|
||||
return 1;
|
||||
@ -766,14 +766,14 @@ bool extract_all_data(const fs::file* input, const fs::file* output, const char*
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Reads one 128-bit RAP key from rap_file, converts it with rap_to_rif() and
// returns the resulting RIF key by value.
// NOTE(review): the result of rap_file.read() is not checked here; presumably a
// failed read leaves rapkey zero-initialized. Confirm against fs::file::read.
// NOTE(review): this span is a rendered diff hunk. Each changed statement appears
// twice: first the pre-change v128 form, then its u128 replacement.
v128 GetEdatRifKeyFromRapFile(const fs::file& rap_file)
|
||||
u128 GetEdatRifKeyFromRapFile(const fs::file& rap_file)
|
||||
{
|
||||
v128 rapkey{};
|
||||
v128 rifkey{};
|
||||
u128 rapkey{};
|
||||
u128 rifkey{};
|
||||
|
||||
rap_file.read<v128>(rapkey);
|
||||
rap_file.read<u128>(rapkey);
|
||||
|
||||
// rap_to_rif() takes raw byte pointers, so the u128 form passes the objects' addresses.
rap_to_rif(rapkey._bytes, rifkey._bytes);
|
||||
rap_to_rif(reinterpret_cast<uchar*>(&rapkey), reinterpret_cast<uchar*>(&rifkey));
|
||||
|
||||
return rifkey;
|
||||
}
|
||||
@ -824,8 +824,8 @@ fs::file DecryptEDAT(const fs::file& input, const std::string& input_file_name,
|
||||
input.seek(0);
|
||||
|
||||
// Set keys (RIF and DEVKLIC).
|
||||
v128 rifKey{};
|
||||
v128 devklic{};
|
||||
u128 rifKey{};
|
||||
u128 devklic{};
|
||||
|
||||
// Select the EDAT key mode.
|
||||
switch (mode)
|
||||
@ -879,7 +879,7 @@ fs::file DecryptEDAT(const fs::file& input, const std::string& input_file_name,
|
||||
|
||||
// Delete the bad output file if any errors arise.
|
||||
fs::file output = fs::make_stream<std::vector<u8>>();
|
||||
if (extract_all_data(&input, &output, input_file_name.c_str(), devklic._bytes, rifKey._bytes, verbose))
|
||||
if (extract_all_data(&input, &output, input_file_name.c_str(), reinterpret_cast<uchar*>(&devklic), reinterpret_cast<uchar*>(&rifKey), verbose))
|
||||
{
|
||||
output.release();
|
||||
return fs::file{};
|
||||
@ -905,12 +905,12 @@ bool EDATADecrypter::ReadHeader()
|
||||
if ((edatHeader.flags & SDAT_FLAG) == SDAT_FLAG)
|
||||
{
|
||||
// Generate SDAT key.
|
||||
dec_key = std::bit_cast<v128>(npdHeader.dev_hash) ^ std::bit_cast<v128>(SDAT_KEY);
|
||||
dec_key = std::bit_cast<u128>(npdHeader.dev_hash) ^ std::bit_cast<u128>(SDAT_KEY);
|
||||
}
|
||||
else
|
||||
{
|
||||
// verify key
|
||||
if (validate_dev_klic(dev_key._bytes, &npdHeader) == 0)
|
||||
if (validate_dev_klic(reinterpret_cast<uchar*>(&dev_key), &npdHeader) == 0)
|
||||
{
|
||||
edat_log.error("EDAT: Failed validating klic");
|
||||
return false;
|
||||
@ -923,7 +923,7 @@ bool EDATADecrypter::ReadHeader()
|
||||
{
|
||||
dec_key = std::move(rif_key);
|
||||
|
||||
if (dec_key == v128{})
|
||||
if (!dec_key)
|
||||
{
|
||||
edat_log.warning("EDAT: Empty Dec key for local activation!");
|
||||
}
|
||||
@ -932,7 +932,7 @@ bool EDATADecrypter::ReadHeader()
|
||||
{
|
||||
dec_key = std::move(rif_key);
|
||||
|
||||
if (dec_key == v128{})
|
||||
if (!dec_key)
|
||||
{
|
||||
edat_log.warning("EDAT: Empty Dec key for network activation!");
|
||||
}
|
||||
@ -978,7 +978,7 @@ u64 EDATADecrypter::ReadData(u64 pos, u8* data, u64 size)
|
||||
for (u32 i = starting_block; i < ending_block; ++i)
|
||||
{
|
||||
edata_file.seek(0);
|
||||
u64 res = decrypt_block(&edata_file, &data_buf[writeOffset], &edatHeader, &npdHeader, dec_key._bytes, i, total_blocks, edatHeader.file_size);
|
||||
u64 res = decrypt_block(&edata_file, &data_buf[writeOffset], &edatHeader, &npdHeader, reinterpret_cast<uchar*>(&dec_key), i, total_blocks, edatHeader.file_size);
|
||||
if (res == umax)
|
||||
{
|
||||
edat_log.error("Error Decrypting data");
|
||||
|
@ -6,8 +6,6 @@
|
||||
|
||||
#include "Utilities/File.h"
|
||||
|
||||
#include "util/v128.hpp"
|
||||
|
||||
constexpr u32 SDAT_FLAG = 0x01000000;
|
||||
constexpr u32 EDAT_COMPRESSED_FLAG = 0x00000001;
|
||||
constexpr u32 EDAT_FLAG_0x02 = 0x00000002;
|
||||
@ -18,8 +16,8 @@ constexpr u32 EDAT_DEBUG_DATA_FLAG = 0x80000000;
|
||||
|
||||
// Holds the loaded NPDRM keys (devKlic, rifKey) together with npdrm_fds; every
// member is wrapped in atomic_t.
// NOTE(review): this span is a rendered diff hunk. The two key members appear
// twice: first as atomic_t<v128> (pre-change), then as atomic_t<u128>.
struct loaded_npdrm_keys
|
||||
{
|
||||
atomic_t<v128> devKlic{};
|
||||
atomic_t<v128> rifKey{};
|
||||
atomic_t<u128> devKlic{};
|
||||
atomic_t<u128> rifKey{};
|
||||
atomic_t<u32> npdrm_fds{0};
|
||||
};
|
||||
|
||||
@ -49,7 +47,7 @@ extern fs::file DecryptEDAT(const fs::file& input, const std::string& input_file
|
||||
|
||||
extern bool VerifyEDATHeaderWithKLicense(const fs::file& input, const std::string& input_file_name, const u8* custom_klic, std::string* contentID);
|
||||
|
||||
v128 GetEdatRifKeyFromRapFile(const fs::file& rap_file);
|
||||
u128 GetEdatRifKeyFromRapFile(const fs::file& rap_file);
|
||||
|
||||
struct EDATADecrypter final : fs::file_base
|
||||
{
|
||||
@ -66,18 +64,20 @@ struct EDATADecrypter final : fs::file_base
|
||||
std::unique_ptr<u8[]> data_buf;
|
||||
u64 data_buf_size{0};
|
||||
|
||||
v128 dec_key{};
|
||||
u128 dec_key{};
|
||||
|
||||
// edat usage
|
||||
v128 rif_key{};
|
||||
v128 dev_key{};
|
||||
u128 rif_key{};
|
||||
u128 dev_key{};
|
||||
public:
|
||||
// SdataByFd usage
|
||||
EDATADecrypter(fs::file&& input)
|
||||
: edata_file(std::move(input)) {}
|
||||
// Edat usage
|
||||
EDATADecrypter(fs::file&& input, const v128& dev_key, const v128& rif_key)
|
||||
: edata_file(std::move(input)), rif_key(rif_key), dev_key(dev_key) {}
|
||||
EDATADecrypter(fs::file&& input, const u128& dev_key, const u128& rif_key)
|
||||
: edata_file(std::move(input))
|
||||
, rif_key(rif_key)
|
||||
, dev_key(dev_key) {}
|
||||
|
||||
~EDATADecrypter() override {}
|
||||
// false if invalid
|
||||
|
@ -9,8 +9,6 @@
|
||||
#include <algorithm>
|
||||
#include <zlib.h>
|
||||
|
||||
#include "util/v128.hpp"
|
||||
|
||||
inline u8 Read8(const fs::file& f)
|
||||
{
|
||||
u8 ret;
|
||||
@ -1489,7 +1487,7 @@ bool verify_npdrm_self_headers(const fs::file& self, u8* klic_key)
|
||||
return true;
|
||||
}
|
||||
|
||||
// Returns NP_KLIC_FREE reinterpreted bit-for-bit (std::bit_cast) as a 128-bit value.
// NOTE(review): this span is a rendered diff hunk. The signature and the return
// statement appear twice: first in v128 form (pre-change), then in u128 form.
v128 get_default_self_klic()
|
||||
u128 get_default_self_klic()
|
||||
{
|
||||
return std::bit_cast<v128>(NP_KLIC_FREE);
|
||||
return std::bit_cast<u128>(NP_KLIC_FREE);
|
||||
}
|
||||
|
@ -509,5 +509,4 @@ private:
|
||||
fs::file decrypt_self(fs::file elf_or_self, u8* klic_key = nullptr, SelfAdditionalInfo* additional_info = nullptr);
|
||||
bool verify_npdrm_self_headers(const fs::file& self, u8* klic_key = nullptr);
|
||||
|
||||
union v128;
|
||||
v128 get_default_self_klic();
|
||||
u128 get_default_self_klic();
|
||||
|
@ -33,7 +33,6 @@ target_include_directories(rpcs3_emu
|
||||
# Utilities
|
||||
target_sources(rpcs3_emu PRIVATE
|
||||
../util/atomic.cpp
|
||||
../util/atomic2.cpp
|
||||
../util/fixed_typemap.cpp
|
||||
../util/logs.cpp
|
||||
../util/yaml.cpp
|
||||
|
@ -15,6 +15,9 @@
|
||||
#include <unordered_map>
|
||||
#include <map>
|
||||
|
||||
#include <immintrin.h>
|
||||
#include <emmintrin.h>
|
||||
|
||||
DECLARE(cpu_thread::g_threads_created){0};
|
||||
DECLARE(cpu_thread::g_threads_deleted){0};
|
||||
DECLARE(cpu_thread::g_suspend_counter){0};
|
||||
@ -938,7 +941,7 @@ bool cpu_thread::suspend_work::push(cpu_thread* _this) noexcept
|
||||
break;
|
||||
}
|
||||
|
||||
_mm_pause();
|
||||
utils::pause();
|
||||
}
|
||||
|
||||
// Second increment: all threads paused
|
||||
|
@ -2,6 +2,9 @@
|
||||
|
||||
#include "CPUTranslator.h"
|
||||
|
||||
#include "util/v128.hpp"
|
||||
#include "util/v128sse.hpp"
|
||||
|
||||
llvm::LLVMContext g_llvm_ctx;
|
||||
|
||||
cpu_translator::cpu_translator(llvm::Module* _module, bool is_be)
|
||||
|
@ -5,6 +5,9 @@
|
||||
#include "Emu/Cell/lv2/sys_process.h"
|
||||
#include "Emu/Cell/lv2/sys_event.h"
|
||||
#include "cellAudio.h"
|
||||
|
||||
#include "emmintrin.h"
|
||||
#include "immintrin.h"
|
||||
#include <cmath>
|
||||
|
||||
LOG_CHANNEL(cellAudio);
|
||||
|
@ -3,8 +3,6 @@
|
||||
#include "stdafx.h"
|
||||
#include <Emu/Memory/vm_ptr.h>
|
||||
|
||||
#include "util/v128.hpp"
|
||||
|
||||
// Return codes
|
||||
enum CellSaveDataError : u32
|
||||
{
|
||||
@ -300,7 +298,7 @@ struct CellSaveDataFileSet
|
||||
be_t<u32> fileOperation;
|
||||
vm::bptr<void> reserved;
|
||||
be_t<u32> fileType;
|
||||
be_t<v128, 1> secureFileId;
|
||||
be_t<u128, 1> secureFileId;
|
||||
vm::bptr<char> fileName;
|
||||
be_t<u32> fileOffset;
|
||||
be_t<u32> fileSize;
|
||||
|
@ -16,6 +16,7 @@
|
||||
#include "cellSpurs.h"
|
||||
|
||||
#include "util/v128.hpp"
|
||||
#include "util/v128sse.hpp"
|
||||
|
||||
LOG_CHANNEL(cellSpurs);
|
||||
|
||||
|
@ -14,6 +14,7 @@
|
||||
#include <mutex>
|
||||
|
||||
#include "util/v128.hpp"
|
||||
#include "util/v128sse.hpp"
|
||||
|
||||
LOG_CHANNEL(cellSpurs);
|
||||
|
||||
|
@ -15,8 +15,6 @@
|
||||
#include "Emu/NP/np_handler.h"
|
||||
#include "Emu/NP/np_contexts.h"
|
||||
|
||||
#include "util/v128.hpp"
|
||||
|
||||
LOG_CHANNEL(sceNp);
|
||||
|
||||
template <>
|
||||
@ -447,12 +445,12 @@ error_code sceNpTerm()
|
||||
|
||||
error_code npDrmIsAvailable(vm::cptr<u8> k_licensee_addr, vm::cptr<char> drm_path)
|
||||
{
|
||||
v128 k_licensee{};
|
||||
u128 k_licensee{};
|
||||
|
||||
if (k_licensee_addr)
|
||||
{
|
||||
std::memcpy(&k_licensee, k_licensee_addr.get_ptr(), sizeof(k_licensee));
|
||||
sceNp.notice("npDrmIsAvailable(): KLicense key %s", std::bit_cast<be_t<v128>>(k_licensee));
|
||||
sceNp.notice("npDrmIsAvailable(): KLicense key %s", std::bit_cast<be_t<u128>>(k_licensee));
|
||||
}
|
||||
|
||||
if (Emu.GetCat() == "PE")
|
||||
@ -488,7 +486,7 @@ error_code npDrmIsAvailable(vm::cptr<u8> k_licensee_addr, vm::cptr<char> drm_pat
|
||||
if (!k_licensee_addr)
|
||||
k_licensee = get_default_self_klic();
|
||||
|
||||
if (verify_npdrm_self_headers(enc_file, k_licensee._bytes))
|
||||
if (verify_npdrm_self_headers(enc_file, reinterpret_cast<u8*>(&k_licensee)))
|
||||
{
|
||||
npdrmkeys->devKlic = k_licensee;
|
||||
}
|
||||
@ -504,7 +502,7 @@ error_code npDrmIsAvailable(vm::cptr<u8> k_licensee_addr, vm::cptr<char> drm_pat
|
||||
|
||||
std::string contentID;
|
||||
|
||||
if (VerifyEDATHeaderWithKLicense(enc_file, enc_drm_path_local, k_licensee._bytes, &contentID))
|
||||
if (VerifyEDATHeaderWithKLicense(enc_file, enc_drm_path_local, reinterpret_cast<u8*>(&k_licensee), &contentID))
|
||||
{
|
||||
const std::string rap_file = rap_dir_path + contentID + ".rap";
|
||||
npdrmkeys->devKlic = k_licensee;
|
||||
|
@ -12,6 +12,7 @@
|
||||
|
||||
#include "util/asm.hpp"
|
||||
#include "util/v128.hpp"
|
||||
#include "util/v128sse.hpp"
|
||||
#include "util/sysinfo.hpp"
|
||||
|
||||
#if !defined(_MSC_VER) && defined(__clang__)
|
||||
|
@ -66,6 +66,8 @@
|
||||
#include "util/asm.hpp"
|
||||
#include "util/vm.hpp"
|
||||
#include "util/v128.hpp"
|
||||
#include "util/v128sse.hpp"
|
||||
#include "util/sysinfo.hpp"
|
||||
|
||||
const bool s_use_ssse3 = utils::has_ssse3();
|
||||
|
||||
|
@ -9,6 +9,7 @@
|
||||
#include "util/endian.hpp"
|
||||
#include "util/logs.hpp"
|
||||
#include "util/v128.hpp"
|
||||
#include "util/v128sse.hpp"
|
||||
#include <algorithm>
|
||||
|
||||
using namespace llvm;
|
||||
|
@ -12,6 +12,7 @@
|
||||
|
||||
#include "util/asm.hpp"
|
||||
#include "util/v128.hpp"
|
||||
#include "util/v128sse.hpp"
|
||||
#include "util/sysinfo.hpp"
|
||||
|
||||
#include <cmath>
|
||||
@ -959,7 +960,7 @@ spu_recompiler::XmmLink spu_recompiler::XmmGet(s8 reg, XmmType type) // get xmm
|
||||
return result;
|
||||
}
|
||||
|
||||
inline asmjit::X86Mem spu_recompiler::XmmConst(v128 data)
|
||||
inline asmjit::X86Mem spu_recompiler::XmmConst(const v128& data)
|
||||
{
|
||||
// Find existing const
|
||||
auto& xmm_label = xmm_consts[std::make_pair(data._u64[0], data._u64[1])];
|
||||
@ -980,12 +981,12 @@ inline asmjit::X86Mem spu_recompiler::XmmConst(v128 data)
|
||||
return asmjit::x86::oword_ptr(xmm_label);
|
||||
}
|
||||
|
||||
// Convenience overload: converts the __m128 value with v128::fromF() and forwards
// to the v128 overload of XmmConst.
// NOTE(review): rendered diff hunk. The by-value signature is the pre-change line
// and the const-reference signature is its replacement.
inline asmjit::X86Mem spu_recompiler::XmmConst(__m128 data)
|
||||
inline asmjit::X86Mem spu_recompiler::XmmConst(const __m128& data)
|
||||
{
|
||||
return XmmConst(v128::fromF(data));
|
||||
}
|
||||
|
||||
// Convenience overload: converts the __m128i value with v128::fromV() and forwards
// to the v128 overload of XmmConst.
// NOTE(review): rendered diff hunk. The by-value signature is the pre-change line
// and the const-reference signature is its replacement.
inline asmjit::X86Mem spu_recompiler::XmmConst(__m128i data)
|
||||
inline asmjit::X86Mem spu_recompiler::XmmConst(const __m128i& data)
|
||||
{
|
||||
return XmmConst(v128::fromV(data));
|
||||
}
|
||||
|
@ -5,7 +5,7 @@
|
||||
|
||||
#include <functional>
|
||||
|
||||
#include "util/v128.hpp"
|
||||
union v128;
|
||||
|
||||
// SPU ASMJIT Recompiler
|
||||
class spu_recompiler : public spu_recompiler_base
|
||||
@ -87,9 +87,9 @@ private:
|
||||
XmmLink XmmAlloc();
|
||||
XmmLink XmmGet(s8 reg, XmmType type);
|
||||
|
||||
asmjit::X86Mem XmmConst(v128 data);
|
||||
asmjit::X86Mem XmmConst(__m128 data);
|
||||
asmjit::X86Mem XmmConst(__m128i data);
|
||||
asmjit::X86Mem XmmConst(const v128& data);
|
||||
asmjit::X86Mem XmmConst(const __m128& data);
|
||||
asmjit::X86Mem XmmConst(const __m128i& data);
|
||||
|
||||
asmjit::X86Mem get_pc(u32 addr);
|
||||
void branch_fixed(u32 target, bool absolute = false);
|
||||
|
@ -8,6 +8,7 @@ const spu_decoder<spu_itype> s_spu_itype;
|
||||
const spu_decoder<spu_iflag> s_spu_iflag;
|
||||
|
||||
#include "util/v128.hpp"
|
||||
#include "util/v128sse.hpp"
|
||||
|
||||
u32 SPUDisAsm::disasm(u32 pc)
|
||||
{
|
||||
@ -161,7 +162,7 @@ std::pair<bool, v128> SPUDisAsm::try_get_const_value(u32 reg, u32 pc) const
|
||||
return {};
|
||||
}
|
||||
|
||||
typename SPUDisAsm::insert_mask_info SPUDisAsm::try_get_insert_mask_info(v128 mask)
|
||||
typename SPUDisAsm::insert_mask_info SPUDisAsm::try_get_insert_mask_info(const v128& mask)
|
||||
{
|
||||
if ((mask & v128::from8p(0xe0)) != v128{})
|
||||
{
|
||||
@ -302,3 +303,29 @@ void SPUDisAsm::IOHL(spu_opcode_t op)
|
||||
|
||||
DisAsm("iohl", spu_reg_name[op.rt], op.i16);
|
||||
}
|
||||
|
||||
// Disassembles SHUFB. If try_get_const_value() reports a constant for the rc
// register and try_get_insert_mask_info() returns a non-zero size for it, the rc
// operand is printed with an "#i<bits>[<dst>]" annotation; otherwise the plain
// four-register form is emitted.
void SPUDisAsm::SHUFB(spu_opcode_t op)
{
	const char* const rt = spu_reg_name[op.rt4];
	const char* const ra = spu_reg_name[op.ra];
	const char* const rb = spu_reg_name[op.rb];
	const char* const rc = spu_reg_name[op.rc];

	if (const auto [known, mask] = try_get_const_value(op.rc); known)
	{
		if (const auto [width, to, from] = try_get_insert_mask_info(mask); width)
		{
			// Comment insertion pattern for CWD-alike instruction
			const bool cwd_like = (width >= 4u && !from) || (width == 2u && from == 1u) || (width == 1u && from == 3u);

			// Otherwise: comment insertion pattern for unknown instruction formations
			const auto note = cwd_like
				? fmt::format("%s #i%u[%u]", rc, width * 8, to)
				: fmt::format("%s #i%u[%u] = [%u]", rc, width * 8, to, from);

			DisAsm("shufb", rt, ra, rb, note.c_str());
			return;
		}
	}

	DisAsm("shufb", rt, ra, rb, rc);
}
|
||||
|
@ -3,7 +3,7 @@
|
||||
#include "PPCDisAsm.h"
|
||||
#include "SPUOpcodes.h"
|
||||
|
||||
#include "util/v128.hpp"
|
||||
union v128;
|
||||
|
||||
static constexpr const char* spu_reg_name[128] =
|
||||
{
|
||||
@ -172,7 +172,7 @@ public:
|
||||
u32 src_index;
|
||||
};
|
||||
|
||||
static insert_mask_info try_get_insert_mask_info(v128 mask);
|
||||
static insert_mask_info try_get_insert_mask_info(const v128& mask);
|
||||
|
||||
//0 - 10
|
||||
void STOP(spu_opcode_t op)
|
||||
@ -972,31 +972,7 @@ public:
|
||||
{
|
||||
DisAsm("selb", spu_reg_name[op.rt4], spu_reg_name[op.ra], spu_reg_name[op.rb], spu_reg_name[op.rc]);
|
||||
}
|
||||
void SHUFB(spu_opcode_t op)
|
||||
{
|
||||
const auto [is_const, value] = try_get_const_value(op.rc);
|
||||
|
||||
if (is_const)
|
||||
{
|
||||
const auto [size, dst, src] = try_get_insert_mask_info(value);
|
||||
|
||||
if (size)
|
||||
{
|
||||
if ((size >= 4u && !src) || (size == 2u && src == 1u) || (size == 1u && src == 3u))
|
||||
{
|
||||
// Comment insertion pattern for CWD-alike instruction
|
||||
DisAsm("shufb", spu_reg_name[op.rt4], spu_reg_name[op.ra], spu_reg_name[op.rb], fmt::format("%s #i%u[%u]", spu_reg_name[op.rc], size * 8, dst).c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
// Comment insertion pattern for unknown instruction formations
|
||||
DisAsm("shufb", spu_reg_name[op.rt4], spu_reg_name[op.ra], spu_reg_name[op.rb], fmt::format("%s #i%u[%u] = [%u]", spu_reg_name[op.rc], size * 8, dst, src).c_str());
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
DisAsm("shufb", spu_reg_name[op.rt4], spu_reg_name[op.ra], spu_reg_name[op.rb], spu_reg_name[op.rc]);
|
||||
}
|
||||
void SHUFB(spu_opcode_t op);
|
||||
void MPYA(spu_opcode_t op)
|
||||
{
|
||||
DisAsm("mpya", spu_reg_name[op.rt4], spu_reg_name[op.ra], spu_reg_name[op.rb], spu_reg_name[op.rc]);
|
||||
|
@ -7,6 +7,7 @@
|
||||
|
||||
#include "util/asm.hpp"
|
||||
#include "util/v128.hpp"
|
||||
#include "util/v128sse.hpp"
|
||||
#include "util/sysinfo.hpp"
|
||||
|
||||
#include <cmath>
|
||||
|
@ -18,6 +18,7 @@
|
||||
#include <thread>
|
||||
|
||||
#include "util/v128.hpp"
|
||||
#include "util/v128sse.hpp"
|
||||
#include "util/sysinfo.hpp"
|
||||
|
||||
extern atomic_t<const char*> g_progr;
|
||||
|
@ -31,6 +31,7 @@
|
||||
#include "util/vm.hpp"
|
||||
#include "util/asm.hpp"
|
||||
#include "util/v128.hpp"
|
||||
#include "util/v128sse.hpp"
|
||||
#include "util/sysinfo.hpp"
|
||||
|
||||
using spu_rdata_t = decltype(spu_thread::rdata);
|
||||
@ -1558,7 +1559,7 @@ void spu_thread::cpu_return()
|
||||
for (u32 status; !thread->exit_status.try_read(status)
|
||||
|| status != thread->last_exit_status;)
|
||||
{
|
||||
_mm_pause();
|
||||
utils::pause();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -504,7 +504,7 @@ struct spu_imm_table_t
|
||||
public:
|
||||
scale_table_t();
|
||||
|
||||
FORCE_INLINE __m128 operator [] (s32 scale) const
|
||||
FORCE_INLINE const auto& operator [](s32 scale) const
|
||||
{
|
||||
return m_data[scale + 155].vf;
|
||||
}
|
||||
|
@ -32,7 +32,9 @@ static error_code overlay_load_module(vm::ptr<u32> ovlmid, const std::string& vp
|
||||
src = std::move(lv2_file);
|
||||
}
|
||||
|
||||
const ppu_exec_object obj = decrypt_self(std::move(src), g_fxo->get<loaded_npdrm_keys>()->devKlic.load()._bytes);
|
||||
u128 klic = g_fxo->get<loaded_npdrm_keys>()->devKlic.load();
|
||||
|
||||
const ppu_exec_object obj = decrypt_self(std::move(src), reinterpret_cast<u8*>(&klic));
|
||||
|
||||
if (obj != elf_error::ok)
|
||||
{
|
||||
|
@ -403,10 +403,9 @@ void _sys_process_exit2(ppu_thread& ppu, s32 status, vm::ptr<sys_exit2_param> ar
|
||||
Emu.disc = std::move(disc);
|
||||
Emu.hdd1 = std::move(hdd1);
|
||||
|
||||
if (klic != v128{})
|
||||
if (klic)
|
||||
{
|
||||
// TODO: Use std::optional
|
||||
Emu.klic.assign(std::begin(klic._bytes), std::end(klic._bytes));
|
||||
Emu.klic.emplace_back(klic);
|
||||
}
|
||||
|
||||
Emu.SetForceBoot(true);
|
||||
|
@ -263,7 +263,9 @@ static error_code prx_load_module(const std::string& vpath, u64 flags, vm::ptr<s
|
||||
src = std::move(lv2_file);
|
||||
}
|
||||
|
||||
const ppu_prx_object obj = decrypt_self(std::move(src), g_fxo->get<loaded_npdrm_keys>()->devKlic.load()._bytes);
|
||||
u128 klic = g_fxo->get<loaded_npdrm_keys>()->devKlic.load();
|
||||
|
||||
const ppu_prx_object obj = decrypt_self(std::move(src), reinterpret_cast<u8*>(&klic));
|
||||
|
||||
if (obj != elf_error::ok)
|
||||
{
|
||||
|
@ -251,7 +251,9 @@ error_code sys_spu_image_open(ppu_thread& ppu, vm::ptr<sys_spu_image> img, vm::c
|
||||
return {fs_error, path};
|
||||
}
|
||||
|
||||
const fs::file elf_file = decrypt_self(std::move(file), g_fxo->get<loaded_npdrm_keys>()->devKlic.load()._bytes);
|
||||
u128 klic = g_fxo->get<loaded_npdrm_keys>()->devKlic.load();
|
||||
|
||||
const fs::file elf_file = decrypt_self(std::move(file), reinterpret_cast<u8*>(&klic));
|
||||
|
||||
if (!elf_file)
|
||||
{
|
||||
|
@ -323,7 +323,7 @@ namespace vm
|
||||
break;
|
||||
}
|
||||
|
||||
_mm_pause();
|
||||
utils::pause();
|
||||
}
|
||||
}
|
||||
|
||||
@ -525,7 +525,7 @@ namespace vm
|
||||
break;
|
||||
}
|
||||
|
||||
_mm_pause();
|
||||
utils::pause();
|
||||
}
|
||||
|
||||
for (auto lock = g_locks.cbegin(), end = lock + g_cfg.core.ppu_threads; lock != end; lock++)
|
||||
@ -533,7 +533,9 @@ namespace vm
|
||||
if (auto ptr = +*lock)
|
||||
{
|
||||
while (!(ptr->state & cpu_flag::wait))
|
||||
_mm_pause();
|
||||
{
|
||||
utils::pause();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1606,7 +1608,7 @@ namespace vm
|
||||
case 2: atomic_storage<u16>::release(*static_cast<u16*>(dst), *static_cast<u16*>(src)); break;
|
||||
case 4: atomic_storage<u32>::release(*static_cast<u32*>(dst), *static_cast<u32*>(src)); break;
|
||||
case 8: atomic_storage<u64>::release(*static_cast<u64*>(dst), *static_cast<u64*>(src)); break;
|
||||
case 16: _mm_store_si128(static_cast<__m128i*>(dst), _mm_loadu_si128(static_cast<__m128i*>(src))); break;
|
||||
case 16: atomic_storage<u128>::release(*static_cast<u128*>(dst), *static_cast<u128*>(src)); break;
|
||||
}
|
||||
|
||||
return true;
|
||||
|
@ -8,8 +8,26 @@
|
||||
extern bool g_use_rtm;
|
||||
extern u64 g_rtm_tx_limit2;
|
||||
|
||||
#ifdef _MSC_VER
|
||||
extern "C"
|
||||
{
|
||||
u64 __rdtsc();
|
||||
u32 _xbegin();
|
||||
void _xend();
|
||||
}
|
||||
#endif
|
||||
|
||||
namespace vm
|
||||
{
|
||||
// Returns the current timestamp-counter value: __builtin_ia32_rdtsc() on
// non-MSVC compilers, __rdtsc() on MSVC.
inline u64 get_tsc()
{
#ifndef _MSC_VER
	return __builtin_ia32_rdtsc();
#else
	return __rdtsc();
#endif
}
|
||||
|
||||
enum : u64
|
||||
{
|
||||
rsrv_lock_mask = 127,
|
||||
@ -81,28 +99,28 @@ namespace vm
|
||||
const auto sptr = vm::get_super_ptr<T>(static_cast<u32>(ptr.addr()));
|
||||
|
||||
// Prefetch some data
|
||||
_m_prefetchw(sptr);
|
||||
_m_prefetchw(reinterpret_cast<char*>(sptr) + 64);
|
||||
//_m_prefetchw(sptr);
|
||||
//_m_prefetchw(reinterpret_cast<char*>(sptr) + 64);
|
||||
|
||||
// Use 128-byte aligned addr
|
||||
const u32 addr = static_cast<u32>(ptr.addr()) & -128;
|
||||
|
||||
auto& res = vm::reservation_acquire(addr, 128);
|
||||
_m_prefetchw(&res);
|
||||
//_m_prefetchw(&res);
|
||||
|
||||
if (g_use_rtm)
|
||||
{
|
||||
// Stage 1: single optimistic transaction attempt
|
||||
unsigned status = _XBEGIN_STARTED;
|
||||
unsigned status = -1;
|
||||
u64 _old = 0;
|
||||
|
||||
auto stamp0 = __rdtsc(), stamp1 = stamp0, stamp2 = stamp0;
|
||||
auto stamp0 = get_tsc(), stamp1 = stamp0, stamp2 = stamp0;
|
||||
|
||||
#ifndef _MSC_VER
|
||||
__asm__ goto ("xbegin %l[stage2];" ::: "memory" : stage2);
|
||||
#else
|
||||
status = _xbegin();
|
||||
if (status == _XBEGIN_STARTED)
|
||||
if (status == umax)
|
||||
#endif
|
||||
{
|
||||
if (res & rsrv_unique_lock)
|
||||
@ -158,16 +176,16 @@ namespace vm
|
||||
#ifndef _MSC_VER
|
||||
__asm__ volatile ("mov %%eax, %0;" : "=r" (status) :: "memory");
|
||||
#endif
|
||||
stamp1 = __rdtsc();
|
||||
stamp1 = get_tsc();
|
||||
|
||||
// Stage 2: try to lock reservation first
|
||||
_old = res.fetch_add(1);
|
||||
|
||||
// Compute stamps excluding memory touch
|
||||
stamp2 = __rdtsc() - (stamp1 - stamp0);
|
||||
stamp2 = get_tsc() - (stamp1 - stamp0);
|
||||
|
||||
// Start lightened transaction
|
||||
for (; !(_old & vm::rsrv_unique_lock) && stamp2 - stamp0 <= g_rtm_tx_limit2; stamp2 = __rdtsc())
|
||||
for (; !(_old & vm::rsrv_unique_lock) && stamp2 - stamp0 <= g_rtm_tx_limit2; stamp2 = get_tsc())
|
||||
{
|
||||
if (cpu.has_pause_flag())
|
||||
{
|
||||
@ -179,7 +197,7 @@ namespace vm
|
||||
#else
|
||||
status = _xbegin();
|
||||
|
||||
if (status != _XBEGIN_STARTED) [[unlikely]]
|
||||
if (status != umax) [[unlikely]]
|
||||
{
|
||||
goto retry;
|
||||
}
|
||||
|
@ -3,10 +3,12 @@
|
||||
#include "../rsx_methods.h"
|
||||
#include "../RSXThread.h"
|
||||
|
||||
#include "util/v128.hpp"
|
||||
#include "util/to_endian.hpp"
|
||||
#include "util/sysinfo.hpp"
|
||||
|
||||
#include "emmintrin.h"
|
||||
#include "immintrin.h"
|
||||
|
||||
#define DEBUG_VERTEX_STREAMING 0
|
||||
|
||||
#if !defined(_MSC_VER) && defined(__clang__)
|
||||
@ -166,7 +168,7 @@ namespace
|
||||
const u32 dword_count = size >> 2;
|
||||
const u32 iterations = dword_count >> 2;
|
||||
|
||||
v128 bits_diff{};
|
||||
__m128i bits_diff = _mm_setzero_si128();
|
||||
|
||||
if (s_use_ssse3) [[likely]]
|
||||
{
|
||||
@ -177,12 +179,12 @@ namespace
|
||||
|
||||
if constexpr (!unaligned)
|
||||
{
|
||||
bits_diff = bits_diff | v128::fromV(_mm_xor_si128(_mm_load_si128(dst_ptr), shuffled_vector));
|
||||
bits_diff = _mm_or_si128(bits_diff, _mm_xor_si128(_mm_load_si128(dst_ptr), shuffled_vector));
|
||||
_mm_stream_si128(dst_ptr, shuffled_vector);
|
||||
}
|
||||
else
|
||||
{
|
||||
bits_diff = bits_diff | v128::fromV(_mm_xor_si128(_mm_loadu_si128(dst_ptr), shuffled_vector));
|
||||
bits_diff = _mm_or_si128(bits_diff, _mm_xor_si128(_mm_loadu_si128(dst_ptr), shuffled_vector));
|
||||
_mm_storeu_si128(dst_ptr, shuffled_vector);
|
||||
}
|
||||
|
||||
@ -200,12 +202,12 @@ namespace
|
||||
|
||||
if constexpr (!unaligned)
|
||||
{
|
||||
bits_diff = bits_diff | v128::fromV(_mm_xor_si128(_mm_load_si128(dst_ptr), vec2));
|
||||
bits_diff = _mm_or_si128(bits_diff, _mm_xor_si128(_mm_load_si128(dst_ptr), vec2));
|
||||
_mm_stream_si128(dst_ptr, vec2);
|
||||
}
|
||||
else
|
||||
{
|
||||
bits_diff = bits_diff | v128::fromV(_mm_xor_si128(_mm_loadu_si128(dst_ptr), vec2));
|
||||
bits_diff = _mm_or_si128(bits_diff, _mm_xor_si128(_mm_loadu_si128(dst_ptr), vec2));
|
||||
_mm_storeu_si128(dst_ptr, vec2);
|
||||
}
|
||||
|
||||
@ -228,12 +230,12 @@ namespace
|
||||
if (dst_ptr2[i] != data)
|
||||
{
|
||||
dst_ptr2[i] = data;
|
||||
bits_diff._u32[0] = UINT32_MAX;
|
||||
bits_diff = _mm_set1_epi64x(-1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return bits_diff != v128{};
|
||||
return _mm_cvtsi128_si64(_mm_packs_epi32(bits_diff, bits_diff)) != 0;
|
||||
}
|
||||
|
||||
template bool stream_data_to_memory_swapped_and_compare_u32<false>(void *dst, const void *src, u32 size);
|
||||
|
@ -284,7 +284,7 @@ bool vertex_program_compare::operator()(const RSXVertexProgram &binary1, const R
|
||||
{
|
||||
const auto inst1 = v128::loadu(instBuffer1, instIndex);
|
||||
const auto inst2 = v128::loadu(instBuffer2, instIndex);
|
||||
if (inst1 != inst2)
|
||||
if (inst1._u ^ inst2._u)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
@ -475,7 +475,7 @@ bool fragment_program_compare::operator()(const RSXFragmentProgram& binary1, con
|
||||
const auto inst1 = v128::loadu(instBuffer1, instIndex);
|
||||
const auto inst2 = v128::loadu(instBuffer2, instIndex);
|
||||
|
||||
if (inst1 != inst2)
|
||||
if (inst1._u ^ inst2._u)
|
||||
return false;
|
||||
|
||||
instIndex++;
|
||||
|
@ -397,62 +397,7 @@ public:
|
||||
std::forward<Args>(args)...); // Other arguments
|
||||
}
|
||||
|
||||
void fill_fragment_constants_buffer(gsl::span<f32> dst_buffer, const RSXFragmentProgram &fragment_program, bool sanitize = false) const
|
||||
{
|
||||
const auto I = m_fragment_shader_cache.find(fragment_program);
|
||||
if (I == m_fragment_shader_cache.end())
|
||||
return;
|
||||
|
||||
ensure((dst_buffer.size_bytes() >= ::narrow<int>(I->second.FragmentConstantOffsetCache.size()) * 16u));
|
||||
|
||||
f32* dst = dst_buffer.data();
|
||||
alignas(16) f32 tmp[4];
|
||||
for (usz offset_in_fragment_program : I->second.FragmentConstantOffsetCache)
|
||||
{
|
||||
char* data = static_cast<char*>(fragment_program.get_data()) + offset_in_fragment_program;
|
||||
const __m128i vector = _mm_loadu_si128(reinterpret_cast<__m128i*>(data));
|
||||
const __m128i shuffled_vector = _mm_or_si128(_mm_slli_epi16(vector, 8), _mm_srli_epi16(vector, 8));
|
||||
|
||||
if (!patch_table.is_empty())
|
||||
{
|
||||
_mm_store_ps(tmp, _mm_castsi128_ps(shuffled_vector));
|
||||
bool patched;
|
||||
|
||||
for (int i = 0; i < 4; ++i)
|
||||
{
|
||||
patched = false;
|
||||
for (auto& e : patch_table.db)
|
||||
{
|
||||
//TODO: Use fp comparison with fabsf without hurting performance
|
||||
patched = e.second.test_and_set(tmp[i], &dst[i]);
|
||||
if (patched)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!patched)
|
||||
{
|
||||
dst[i] = tmp[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (sanitize)
|
||||
{
|
||||
//Convert NaNs and Infs to 0
|
||||
const auto masked = _mm_and_si128(shuffled_vector, _mm_set1_epi32(0x7fffffff));
|
||||
const auto valid = _mm_cmplt_epi32(masked, _mm_set1_epi32(0x7f800000));
|
||||
const auto result = _mm_and_si128(shuffled_vector, valid);
|
||||
_mm_stream_si128(std::bit_cast<__m128i*>(dst), result);
|
||||
}
|
||||
else
|
||||
{
|
||||
_mm_stream_si128(std::bit_cast<__m128i*>(dst), shuffled_vector);
|
||||
}
|
||||
|
||||
dst += 4;
|
||||
}
|
||||
}
|
||||
void fill_fragment_constants_buffer(gsl::span<f32> dst_buffer, const RSXFragmentProgram& fragment_program, bool sanitize = false) const;
|
||||
|
||||
void clear()
|
||||
{
|
||||
|
64
rpcs3/Emu/RSX/Common/program_state_cache2.hpp
Normal file
64
rpcs3/Emu/RSX/Common/program_state_cache2.hpp
Normal file
@ -0,0 +1,64 @@
|
||||
#pragma once
|
||||
|
||||
#include "ProgramStateCache.h"
|
||||
|
||||
#include "emmintrin.h"
|
||||
#include "immintrin.h"
|
||||
|
||||
template <typename Traits>
|
||||
void program_state_cache<Traits>::fill_fragment_constants_buffer(gsl::span<f32> dst_buffer, const RSXFragmentProgram &fragment_program, bool sanitize) const
|
||||
{
|
||||
const auto I = m_fragment_shader_cache.find(fragment_program);
|
||||
if (I == m_fragment_shader_cache.end())
|
||||
return;
|
||||
|
||||
ensure((dst_buffer.size_bytes() >= ::narrow<int>(I->second.FragmentConstantOffsetCache.size()) * 16u));
|
||||
|
||||
f32* dst = dst_buffer.data();
|
||||
alignas(16) f32 tmp[4];
|
||||
for (usz offset_in_fragment_program : I->second.FragmentConstantOffsetCache)
|
||||
{
|
||||
char* data = static_cast<char*>(fragment_program.get_data()) + offset_in_fragment_program;
|
||||
const __m128i vector = _mm_loadu_si128(reinterpret_cast<__m128i*>(data));
|
||||
const __m128i shuffled_vector = _mm_or_si128(_mm_slli_epi16(vector, 8), _mm_srli_epi16(vector, 8));
|
||||
|
||||
if (!patch_table.is_empty())
|
||||
{
|
||||
_mm_store_ps(tmp, _mm_castsi128_ps(shuffled_vector));
|
||||
bool patched;
|
||||
|
||||
for (int i = 0; i < 4; ++i)
|
||||
{
|
||||
patched = false;
|
||||
for (auto& e : patch_table.db)
|
||||
{
|
||||
//TODO: Use fp comparison with fabsf without hurting performance
|
||||
patched = e.second.test_and_set(tmp[i], &dst[i]);
|
||||
if (patched)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!patched)
|
||||
{
|
||||
dst[i] = tmp[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (sanitize)
|
||||
{
|
||||
//Convert NaNs and Infs to 0
|
||||
const auto masked = _mm_and_si128(shuffled_vector, _mm_set1_epi32(0x7fffffff));
|
||||
const auto valid = _mm_cmplt_epi32(masked, _mm_set1_epi32(0x7f800000));
|
||||
const auto result = _mm_and_si128(shuffled_vector, valid);
|
||||
_mm_stream_si128(std::bit_cast<__m128i*>(dst), result);
|
||||
}
|
||||
else
|
||||
{
|
||||
_mm_stream_si128(std::bit_cast<__m128i*>(dst), shuffled_vector);
|
||||
}
|
||||
|
||||
dst += 4;
|
||||
}
|
||||
}
|
@ -7,6 +7,8 @@
|
||||
#include "Emu/Memory/vm_locking.h"
|
||||
#include "Emu/RSX/rsx_methods.h"
|
||||
|
||||
#include "../Common/program_state_cache2.hpp"
|
||||
|
||||
#define DUMP_VERTEX_DATA 0
|
||||
|
||||
u64 GLGSRender::get_cycles()
|
||||
|
@ -3,6 +3,8 @@
|
||||
#include "Emu/System.h"
|
||||
#include "Emu/Cell/Modules/cellMsgDialog.h"
|
||||
|
||||
#include "util/asm.hpp"
|
||||
|
||||
namespace rsx
|
||||
{
|
||||
void shader_loading_dialog::create(const std::string& msg, const std::string& title)
|
||||
@ -27,7 +29,7 @@ namespace rsx
|
||||
|
||||
while (ref_cnt.load() && !Emu.IsStopped())
|
||||
{
|
||||
_mm_pause();
|
||||
utils::pause();
|
||||
}
|
||||
}
|
||||
|
||||
@ -87,7 +89,7 @@ namespace rsx
|
||||
{
|
||||
while (ref_cnt.load() && !Emu.IsStopped())
|
||||
{
|
||||
_mm_pause();
|
||||
utils::pause();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -6,6 +6,7 @@
|
||||
#include "rsx_utils.h"
|
||||
|
||||
#include <thread>
|
||||
#include "util/asm.hpp"
|
||||
|
||||
namespace rsx
|
||||
{
|
||||
@ -171,13 +172,13 @@ namespace rsx
|
||||
while (_thr->m_enqueued_count.load() > _thr->m_processed_count.load())
|
||||
{
|
||||
rsxthr->on_semaphore_acquire_wait();
|
||||
_mm_pause();
|
||||
utils::pause();
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
while (_thr->m_enqueued_count.load() > _thr->m_processed_count.load())
|
||||
_mm_pause();
|
||||
utils::pause();
|
||||
}
|
||||
|
||||
return true;
|
||||
|
@ -862,7 +862,7 @@ namespace rsx
|
||||
|
||||
for (; t == now; now = get_time_ns())
|
||||
{
|
||||
_mm_pause();
|
||||
utils::pause();
|
||||
}
|
||||
|
||||
timestamp_ctrl = now;
|
||||
@ -2662,7 +2662,7 @@ namespace rsx
|
||||
|
||||
for (u32 ea = address >> 20, end = ea + (size >> 20); ea < end; ea++)
|
||||
{
|
||||
const u32 io = utils::ror32(iomap_table.io[ea], 20);
|
||||
const u32 io = utils::rol32(iomap_table.io[ea], 32 - 20);
|
||||
|
||||
if (io + 1)
|
||||
{
|
||||
@ -2747,7 +2747,7 @@ namespace rsx
|
||||
if (Emu.IsStopped())
|
||||
break;
|
||||
|
||||
_mm_pause();
|
||||
utils::pause();
|
||||
}
|
||||
}
|
||||
|
||||
@ -2771,7 +2771,7 @@ namespace rsx
|
||||
while (external_interrupt_lock)
|
||||
{
|
||||
// TODO: Investigate non busy-spinning method
|
||||
_mm_pause();
|
||||
utils::pause();
|
||||
}
|
||||
|
||||
external_interrupt_ack.store(false);
|
||||
|
@ -10,6 +10,8 @@
|
||||
#include "Emu/RSX/rsx_methods.h"
|
||||
#include "Emu/Memory/vm_locking.h"
|
||||
|
||||
#include "../Common/program_state_cache2.hpp"
|
||||
|
||||
#include "util/asm.hpp"
|
||||
|
||||
namespace vk
|
||||
@ -679,7 +681,7 @@ bool VKGSRender::on_access_violation(u32 address, bool is_writing)
|
||||
// Wait for deadlock to clear
|
||||
while (m_queue_status & flush_queue_state::deadlock)
|
||||
{
|
||||
_mm_pause();
|
||||
utils::pause();
|
||||
}
|
||||
|
||||
g_fxo->get<rsx::dma_manager>()->clear_mem_fault_flag();
|
||||
|
@ -300,7 +300,11 @@ namespace vk
|
||||
{
|
||||
while (num_waiters.load() != 0)
|
||||
{
|
||||
#ifdef _MSC_VER
|
||||
_mm_pause();
|
||||
#else
|
||||
__builtin_ia32_pause();
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1006,7 +1006,11 @@ namespace vk
|
||||
}
|
||||
|
||||
//std::this_thread::yield();
|
||||
#ifdef _MSC_VER
|
||||
_mm_pause();
|
||||
#else
|
||||
__builtin_ia32_pause();
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -24,6 +24,10 @@
|
||||
|
||||
#include "3rdparty/GPUOpen/include/vk_mem_alloc.h"
|
||||
|
||||
#ifdef _MSC_VER
|
||||
extern "C" void _mm_pause();
|
||||
#endif
|
||||
|
||||
#ifdef __APPLE__
|
||||
#define VK_DISABLE_COMPONENT_SWIZZLE 1
|
||||
#else
|
||||
@ -1231,7 +1235,11 @@ private:
|
||||
{
|
||||
while (!flushed)
|
||||
{
|
||||
#ifdef _MSC_VER
|
||||
_mm_pause();
|
||||
#else
|
||||
__builtin_ia32_pause();
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1564,7 +1564,7 @@ game_boot_result Emulator::Load(const std::string& title_id, bool add_only, bool
|
||||
elf_file.open(decrypted_path);
|
||||
}
|
||||
// Decrypt SELF
|
||||
else if ((elf_file = decrypt_self(std::move(elf_file), klic.empty() ? nullptr : klic.data(), &g_ps3_process_info.self_info)))
|
||||
else if ((elf_file = decrypt_self(std::move(elf_file), klic.empty() ? nullptr : reinterpret_cast<u8*>(&klic[0]), &g_ps3_process_info.self_info)))
|
||||
{
|
||||
if (true)
|
||||
{
|
||||
|
@ -124,7 +124,7 @@ public:
|
||||
std::vector<std::string> argv;
|
||||
std::vector<std::string> envp;
|
||||
std::vector<u8> data;
|
||||
std::vector<u8> klic;
|
||||
std::vector<u128> klic;
|
||||
std::string disc;
|
||||
std::string hdd1;
|
||||
|
||||
|
@ -707,7 +707,7 @@ std::string vfs::unescape(std::string_view name)
|
||||
|
||||
std::string vfs::host::hash_path(const std::string& path, const std::string& dev_root)
|
||||
{
|
||||
return fmt::format(u8"%s/$%s%s", dev_root, fmt::base57(std::hash<std::string>()(path)), fmt::base57(__rdtsc()));
|
||||
return fmt::format(u8"%s/$%s%s", dev_root, fmt::base57(std::hash<std::string>()(path)), fmt::base57(utils::get_unique_tsc()));
|
||||
}
|
||||
|
||||
bool vfs::host::rename(const std::string& from, const std::string& to, const lv2_fs_mount_point* mp, bool overwrite)
|
||||
|
@ -1,6 +1,8 @@
|
||||
#include "stdafx.h"
|
||||
#include "stdafx.h"
|
||||
#include "perf_meter.hpp"
|
||||
|
||||
#include "util/sysinfo.hpp"
|
||||
|
||||
#include <map>
|
||||
#include <mutex>
|
||||
|
||||
@ -65,6 +67,36 @@ void perf_stat_base::print(const char* name) noexcept
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef _MSC_VER
|
||||
extern "C" void _mm_lfence();
|
||||
#endif
|
||||
|
||||
// Finishes a timed event that began at `start_time` (a get_tsc() value):
// samples the end timestamp, logs the event under `name` when it reaches the configured
// report threshold, and accumulates the duration into the 66-slot array `data`.
SAFE_BUFFERS void perf_stat_base::push(u64 data[66], u64 start_time, const char* name) noexcept
{
	// Event end: issue lfence, then read the timestamp counter
#ifdef _MSC_VER
	_mm_lfence();
#else
	__builtin_ia32_lfence();
#endif
	const u64 end_time = get_tsc();

	// Elapsed time in seconds
	const f64 seconds = (end_time - start_time) * 1. / utils::get_tsc_freq();

	// Elapsed time in microseconds, used for the threshold check and the log line
	const f64 micros = seconds * 1000'000.;

	if (static_cast<u64>(micros) >= g_cfg.core.perf_report_threshold)
	{
		perf_log.notice(u8"%s: %.3fµs", name, micros);
	}

	// Elapsed time in whole nanoseconds, as registered in the stats
	const u64 ns = static_cast<u64>(seconds * 1000'000'000.);

	data[0] += ns != 0;

	// Slot index is the bit width of ns (0 for ns == 0, up to 64)
	data[64 - std::countl_zero(ns)]++;

	// Running total of nanoseconds
	data[65] += ns;
}
|
||||
|
||||
static shared_mutex s_perf_mutex;
|
||||
|
||||
static std::map<std::string, perf_stat_base> s_perf_acc;
|
||||
|
@ -7,10 +7,22 @@
|
||||
#include <array>
|
||||
#include <cmath>
|
||||
|
||||
#include "util/sysinfo.hpp"
|
||||
|
||||
LOG_CHANNEL(perf_log, "PERF");
|
||||
|
||||
#ifdef _MSC_VER
|
||||
extern "C" u64 __rdtsc();
|
||||
|
||||
inline u64 get_tsc()
|
||||
{
|
||||
return __rdtsc();
|
||||
}
|
||||
#else
|
||||
inline u64 get_tsc()
|
||||
{
|
||||
return __builtin_ia32_rdtsc();
|
||||
}
|
||||
#endif
|
||||
|
||||
// TODO: constexpr with the help of bitcast
|
||||
template <auto Name>
|
||||
inline const auto perf_name = []
|
||||
@ -32,6 +44,9 @@ protected:
|
||||
// Accumulate values from a thread
|
||||
void push(u64 ns[66]) noexcept;
|
||||
|
||||
// Get end time; accumulate value to the TLS
|
||||
static void push(u64 ns[66], u64 start_time, const char* name) noexcept;
|
||||
|
||||
// Register TLS storage for stats
|
||||
static void add(u64 ns[66], const char* name) noexcept;
|
||||
|
||||
@ -73,27 +88,9 @@ class perf_stat final : public perf_stat_base
|
||||
} g_tls_perf_stat;
|
||||
|
||||
public:
|
||||
static NEVER_INLINE void push(u64 start_time) noexcept
|
||||
static SAFE_BUFFERS FORCE_INLINE void push(u64 start_time) noexcept
|
||||
{
|
||||
// Event end
|
||||
const u64 end_time = (_mm_lfence(), __rdtsc());
|
||||
|
||||
// Compute difference in seconds
|
||||
const f64 diff = (end_time - start_time) * 1. / utils::get_tsc_freq();
|
||||
|
||||
// Register perf stat in nanoseconds
|
||||
const u64 ns = static_cast<u64>(diff * 1000'000'000.);
|
||||
|
||||
// Print in microseconds
|
||||
if (static_cast<u64>(diff * 1000'000.) >= g_cfg.core.perf_report_threshold)
|
||||
{
|
||||
perf_log.notice(u8"%s: %.3fµs", perf_name<ShortName>.data(), diff * 1000'000.);
|
||||
}
|
||||
|
||||
auto& data = g_tls_perf_stat.m_log;
|
||||
data[0] += ns != 0;
|
||||
data[64 - std::countl_zero(ns)]++;
|
||||
data[65] += ns;
|
||||
perf_stat_base::push(g_tls_perf_stat.m_log, start_time, perf_name<ShortName>.data());
|
||||
}
|
||||
};
|
||||
|
||||
@ -149,7 +146,7 @@ public:
|
||||
if constexpr (std::array<bool, sizeof...(SubEvents)>{(SubEvents == Event)...}[Index])
|
||||
{
|
||||
// Push actual timestamp into an array
|
||||
m_timestamps[Index + 1] = __rdtsc();
|
||||
m_timestamps[Index + 1] = get_tsc();
|
||||
}
|
||||
else if constexpr (Index < sizeof...(SubEvents))
|
||||
{
|
||||
@ -173,7 +170,7 @@ public:
|
||||
// Re-initialize first timestamp
|
||||
SAFE_BUFFERS FORCE_INLINE void restart() noexcept
|
||||
{
|
||||
m_timestamps[0] = __rdtsc();
|
||||
m_timestamps[0] = get_tsc();
|
||||
std::memset(m_timestamps + 1, 0, sizeof(m_timestamps) - sizeof(u64));
|
||||
}
|
||||
|
||||
|
@ -23,7 +23,7 @@ bool TRPLoader::Install(const std::string& dest, bool show)
|
||||
|
||||
const std::string& local_path = vfs::get(dest);
|
||||
|
||||
const auto temp = fmt::format(u8"%s.$temp$%u", local_path, __rdtsc());
|
||||
const auto temp = fmt::format(u8"%s.$temp$%u", local_path, utils::get_unique_tsc());
|
||||
|
||||
if (!fs::create_dir(temp))
|
||||
{
|
||||
|
@ -120,9 +120,6 @@
|
||||
<ClCompile Include="util\atomic.cpp">
|
||||
<PrecompiledHeader>NotUsing</PrecompiledHeader>
|
||||
</ClCompile>
|
||||
<ClCompile Include="util\atomic2.cpp">
|
||||
<PrecompiledHeader>NotUsing</PrecompiledHeader>
|
||||
</ClCompile>
|
||||
<ClCompile Include="util\yaml.cpp">
|
||||
<PrecompiledHeader>NotUsing</PrecompiledHeader>
|
||||
<ExceptionHandling>Sync</ExceptionHandling>
|
||||
@ -516,6 +513,7 @@
|
||||
<ClInclude Include="Emu\system_config_types.h" />
|
||||
<ClInclude Include="util\atomic.hpp" />
|
||||
<ClInclude Include="util\v128.hpp" />
|
||||
<ClInclude Include="util\v128sse.hpp" />
|
||||
<ClInclude Include="util\to_endian.hpp" />
|
||||
<ClInclude Include="..\Utilities\bin_patch.h" />
|
||||
<ClInclude Include="..\Utilities\BitField.h" />
|
||||
@ -531,7 +529,6 @@
|
||||
<ClInclude Include="..\Utilities\mutex.h" />
|
||||
<ClInclude Include="..\Utilities\sema.h" />
|
||||
<ClInclude Include="..\Utilities\sync.h" />
|
||||
<ClInclude Include="util\atomic2.hpp" />
|
||||
<ClInclude Include="util\endian.hpp" />
|
||||
<ClInclude Include="util\fixed_typemap.hpp" />
|
||||
<ClInclude Include="util\init_mutex.hpp" />
|
||||
@ -742,6 +739,7 @@
|
||||
<ClInclude Include="Emu\RSX\Common\BufferUtils.h" />
|
||||
<ClInclude Include="Emu\RSX\Common\FragmentProgramDecompiler.h" />
|
||||
<ClInclude Include="Emu\RSX\Common\ProgramStateCache.h" />
|
||||
<ClInclude Include="Emu\RSX\Common\program_state_cache2.hpp" />
|
||||
<ClInclude Include="Emu\RSX\Common\ring_buffer_helper.h" />
|
||||
<ClInclude Include="Emu\RSX\Common\ShaderParam.h" />
|
||||
<ClInclude Include="Emu\RSX\Common\surface_store.h" />
|
||||
|
@ -935,9 +935,6 @@
|
||||
<ClCompile Include="Emu\RSX\Overlays\overlay_osk_panel.cpp">
|
||||
<Filter>Emu\GPU\RSX\Overlays</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="util\atomic2.cpp">
|
||||
<Filter>Utilities</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="util\logs.cpp">
|
||||
<Filter>Utilities</Filter>
|
||||
</ClCompile>
|
||||
@ -1072,6 +1069,9 @@
|
||||
<ClInclude Include="util\v128.hpp">
|
||||
<Filter>Utilities</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="util\v128sse.hpp">
|
||||
<Filter>Utilities</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="util\to_endian.hpp">
|
||||
<Filter>Utilities</Filter>
|
||||
</ClInclude>
|
||||
@ -1153,6 +1153,9 @@
|
||||
<ClInclude Include="Emu\RSX\Common\ProgramStateCache.h">
|
||||
<Filter>Emu\GPU\RSX\Common</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="Emu\RSX\Common\program_state_cache2.hpp">
|
||||
<Filter>Emu\GPU\RSX\Common</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="Emu\RSX\Common\FragmentProgramDecompiler.h">
|
||||
<Filter>Emu\GPU\RSX\Common</Filter>
|
||||
</ClInclude>
|
||||
@ -1819,9 +1822,6 @@
|
||||
<ClInclude Include="util\yaml.hpp">
|
||||
<Filter>Utilities</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="util\atomic2.hpp">
|
||||
<Filter>Utilities</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="util\endian.hpp">
|
||||
<Filter>Utilities</Filter>
|
||||
</ClInclude>
|
||||
|
@ -44,7 +44,6 @@ DYNAMIC_IMPORT("ntdll.dll", NtSetTimerResolution, NTSTATUS(ULONG DesiredResoluti
|
||||
#include <thread>
|
||||
#include <charconv>
|
||||
|
||||
#include "util/v128.hpp"
|
||||
#include "util/sysinfo.hpp"
|
||||
|
||||
inline std::string sstr(const QString& _in) { return _in.toStdString(); }
|
||||
@ -301,8 +300,6 @@ int main(int argc, char** argv)
|
||||
const u64 intro_time = (intro_stats.ru_utime.tv_sec + intro_stats.ru_stime.tv_sec) * 1000000000ull + (intro_stats.ru_utime.tv_usec + intro_stats.ru_stime.tv_usec) * 1000ull;
|
||||
#endif
|
||||
|
||||
v128::use_fma = utils::has_fma3();
|
||||
|
||||
s_argv0 = argv[0]; // Save for report_fatal_error
|
||||
|
||||
// Only run RPCS3 to display an error
|
||||
|
@ -885,12 +885,12 @@ void main_window::DecryptSPRXLibraries()
|
||||
gui_log.notice("Decrypting binaries...");
|
||||
|
||||
// Always start with no KLIC
|
||||
std::vector<v128> klics{v128{}};
|
||||
std::vector<u128> klics{u128{}};
|
||||
|
||||
if (const auto keys = g_fxo->get<loaded_npdrm_keys>())
|
||||
{
|
||||
// Second klic: get it from a running game
|
||||
if (const v128 klic = keys->devKlic; klic != v128{})
|
||||
if (const u128 klic = keys->devKlic)
|
||||
{
|
||||
klics.emplace_back(klic);
|
||||
}
|
||||
@ -913,7 +913,7 @@ void main_window::DecryptSPRXLibraries()
|
||||
if (elf_file.open(old_path) && elf_file.size() >= 4 && elf_file.read<u32>() == "SCE\0"_u32)
|
||||
{
|
||||
// First KLIC is no KLIC
|
||||
elf_file = decrypt_self(std::move(elf_file), key_it != 0 ? klics[key_it]._bytes : nullptr);
|
||||
elf_file = decrypt_self(std::move(elf_file), key_it != 0 ? reinterpret_cast<u8*>(&klics[key_it]) : nullptr);
|
||||
|
||||
if (!elf_file)
|
||||
{
|
||||
@ -985,11 +985,15 @@ void main_window::DecryptSPRXLibraries()
|
||||
ensure(text.size() == 32);
|
||||
|
||||
// It must succeed (only hex characters are present)
|
||||
std::from_chars(&text[0], &text[16], klic._u64[1], 16); // Not a typo: on LE systems the u64[1] part will be swapped with u64[0] later
|
||||
std::from_chars(&text[16], &text[32], klic._u64[0], 16); // And on BE systems it will be already swapped by index internally
|
||||
u64 lo_ = 0;
|
||||
u64 hi_ = 0;
|
||||
std::from_chars(&text[0], &text[16], lo_, 16);
|
||||
std::from_chars(&text[16], &text[32], hi_, 16);
|
||||
|
||||
// Needs to be in big endian because the left to right byte-order means big endian
|
||||
klic = std::bit_cast<be_t<v128>>(klic);
|
||||
be_t<u64> lo = std::bit_cast<be_t<u64>>(lo_);
|
||||
be_t<u64> hi = std::bit_cast<be_t<u64>>(hi_);
|
||||
|
||||
klic = (u128{+hi} << 64) | +lo;
|
||||
|
||||
// Retry with specified KLIC
|
||||
key_it -= +std::exchange(tried, true); // Rewind on second and above attempt
|
||||
|
@ -179,7 +179,7 @@ void register_editor_dialog::updateRegister(int reg)
|
||||
else if (reg >= ppu_v0 && reg <= ppu_v31)
|
||||
{
|
||||
const auto r = ppu.vr[reg_index];
|
||||
str = r == v128::from32p(r._u32[0]) ? fmt::format("%08x$", r._u32[0]) : fmt::format("%08x %08x %08x %08x", r.u32r[0], r.u32r[1], r.u32r[2], r.u32r[3]);
|
||||
// Use the short "<word>$" form whenever all four 32-bit words are equal, not only when the register is zero
str = (r._u32[0] == r._u32[1] && r._u32[0] == r._u32[2] && r._u32[0] == r._u32[3]) ? fmt::format("%08x$", r._u32[0]) : fmt::format("%08x %08x %08x %08x", r.u32r[0], r.u32r[1], r.u32r[2], r.u32r[3]);
|
||||
}
|
||||
}
|
||||
else if (reg == PPU_CR) str = fmt::format("%08x", ppu.cr.pack());
|
||||
@ -198,7 +198,7 @@ void register_editor_dialog::updateRegister(int reg)
|
||||
{
|
||||
const u32 reg_index = reg % 128;
|
||||
const auto r = spu.gpr[reg_index];
|
||||
str = r == v128::from32p(r._u32[0]) ? fmt::format("%08x$", r._u32[0]) : fmt::format("%08x %08x %08x %08x", r.u32r[0], r.u32r[1], r.u32r[2], r.u32r[3]);
|
||||
// Use the short "<word>$" form whenever all four 32-bit words are equal, not only when the register is zero
str = (r._u32[0] == r._u32[1] && r._u32[0] == r._u32[2] && r._u32[0] == r._u32[3]) ? fmt::format("%08x$", r._u32[0]) : fmt::format("%08x %08x %08x %08x", r.u32r[0], r.u32r[1], r.u32r[2], r.u32r[3]);
|
||||
}
|
||||
else if (reg == MFC_PEVENTS) str = fmt::format("%08x", +spu.ch_events.load().events);
|
||||
else if (reg == MFC_EVENTS_MASK) str = fmt::format("%08x", +spu.ch_events.load().mask);
|
||||
|
@ -5,22 +5,54 @@
|
||||
extern bool g_use_rtm;
|
||||
extern u64 g_rtm_tx_limit1;
|
||||
|
||||
#ifdef _MSC_VER
|
||||
extern "C"
|
||||
{
|
||||
u64 __rdtsc();
|
||||
u32 _xbegin();
|
||||
void _xend();
|
||||
void _mm_pause();
|
||||
void _mm_prefetch(const char*, int);
|
||||
void _m_prefetchw(const volatile void*);
|
||||
|
||||
uchar _rotl8(uchar, uchar);
|
||||
ushort _rotl16(ushort, uchar);
|
||||
uint _rotl(uint, int);
|
||||
u64 _rotl64(u64, int);
|
||||
|
||||
s64 __mulh(s64, s64);
|
||||
u64 __umulh(u64, u64);
|
||||
|
||||
s64 _div128(s64, s64, s64, s64*);
|
||||
u64 _udiv128(u64, u64, u64, u64*);
|
||||
}
|
||||
#endif
|
||||
|
||||
namespace utils
|
||||
{
|
||||
// Returns the current timestamp counter value using the intrinsic of the active compiler
inline u64 get_tsc()
{
#ifndef _MSC_VER
	const u64 stamp = __builtin_ia32_rdtsc();
#else
	const u64 stamp = __rdtsc();
#endif
	return stamp;
}
|
||||
|
||||
// Transaction helper (result = pair of success and op result, or just bool)
|
||||
template <typename F, typename R = std::invoke_result_t<F>>
|
||||
inline auto tx_start(F op)
|
||||
{
|
||||
uint status = -1;
|
||||
|
||||
for (auto stamp0 = __rdtsc(), stamp1 = stamp0; g_use_rtm && stamp1 - stamp0 <= g_rtm_tx_limit1; stamp1 = __rdtsc())
|
||||
for (auto stamp0 = get_tsc(), stamp1 = stamp0; g_use_rtm && stamp1 - stamp0 <= g_rtm_tx_limit1; stamp1 = get_tsc())
|
||||
{
|
||||
#ifndef _MSC_VER
|
||||
__asm__ goto ("xbegin %l[retry];" ::: "memory" : retry);
|
||||
#else
|
||||
status = _xbegin();
|
||||
|
||||
if (status != _XBEGIN_STARTED) [[unlikely]]
|
||||
if (status != umax) [[unlikely]]
|
||||
{
|
||||
goto retry;
|
||||
}
|
||||
@ -80,7 +112,7 @@ namespace utils
|
||||
const void* ptr = reinterpret_cast<const void*>(value);
|
||||
|
||||
#ifdef _MSC_VER
|
||||
return _mm_prefetch(reinterpret_cast<const char*>(ptr), _MM_HINT_T1);
|
||||
return _mm_prefetch(reinterpret_cast<const char*>(ptr), 2);
|
||||
#else
|
||||
return __builtin_prefetch(ptr, 0, 2);
|
||||
#endif
|
||||
@ -95,7 +127,7 @@ namespace utils
|
||||
}
|
||||
|
||||
#ifdef _MSC_VER
|
||||
return _mm_prefetch(reinterpret_cast<const char*>(ptr), _MM_HINT_T0);
|
||||
return _mm_prefetch(reinterpret_cast<const char*>(ptr), 3);
|
||||
#else
|
||||
return __builtin_prefetch(ptr, 0, 3);
|
||||
#endif
|
||||
@ -108,7 +140,11 @@ namespace utils
|
||||
return;
|
||||
}
|
||||
|
||||
#ifdef _MSC_VER
|
||||
return _m_prefetchw(ptr);
|
||||
#else
|
||||
return __builtin_prefetch(ptr, 1, 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
constexpr u8 rol8(u8 x, u8 n)
|
||||
@ -120,8 +156,10 @@ namespace utils
|
||||
|
||||
#ifdef _MSC_VER
|
||||
return _rotl8(x, n);
|
||||
#elif defined(__clang__)
|
||||
return __builtin_rotateleft8(x, n);
|
||||
#else
|
||||
return __rolb(x, n);
|
||||
return __builtin_ia32_rolqi(x, n);
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -133,9 +171,11 @@ namespace utils
|
||||
}
|
||||
|
||||
#ifdef _MSC_VER
|
||||
return _rotl16(x, n);
|
||||
return _rotl16(x, static_cast<uchar>(n));
|
||||
#elif defined(__clang__)
|
||||
return __builtin_rotateleft16(x, n);
|
||||
#else
|
||||
return __rolw(x, n);
|
||||
return __builtin_ia32_rolhi(x, n);
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -148,22 +188,10 @@ namespace utils
|
||||
|
||||
#ifdef _MSC_VER
|
||||
return _rotl(x, n);
|
||||
#elif defined(__clang__)
|
||||
return __builtin_rotateleft32(x, n);
|
||||
#else
|
||||
return __rold(x, n);
|
||||
#endif
|
||||
}
|
||||
|
||||
constexpr u32 ror32(u32 x, u32 n)
|
||||
{
|
||||
if (std::is_constant_evaluated())
|
||||
{
|
||||
return (x >> (n & 31)) | (x << (((0 - n) & 31)));
|
||||
}
|
||||
|
||||
#ifdef _MSC_VER
|
||||
return _rotr(x, n);
|
||||
#else
|
||||
return __rord(x, n);
|
||||
// Mask both shift counts: shifting a 32-bit value by 32 or more (n == 0 or n >= 32 in the unmasked form) is undefined behaviour
return (x << (n & 31)) | (x >> ((0 - n) & 31));
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -176,8 +204,10 @@ namespace utils
|
||||
|
||||
#ifdef _MSC_VER
|
||||
return _rotl64(x, static_cast<int>(n));
|
||||
#elif defined(__clang__)
|
||||
return __builtin_rotateleft64(x, n);
|
||||
#else
|
||||
return __rolq(x, static_cast<int>(n));
|
||||
// Mask both shift counts: shifting a 64-bit value by 64 or more (n == 0 or n >= 64 in the unmasked form) is undefined behaviour
return (x << (n & 63)) | (x >> ((0 - n) & 63));
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -285,12 +315,21 @@ namespace utils
|
||||
#endif
|
||||
}
|
||||
|
||||
// Emits the CPU pause instruction via the intrinsic of the active compiler (for use in spin loops)
inline void pause()
{
#ifndef _MSC_VER
	__builtin_ia32_pause();
#else
	_mm_pause();
#endif
}
|
||||
|
||||
// Synchronization helper (cache-friendly busy waiting)
|
||||
inline void busy_wait(usz cycles = 3000)
|
||||
{
|
||||
const u64 start = __rdtsc();
|
||||
do _mm_pause();
|
||||
while (__rdtsc() - start < cycles);
|
||||
const u64 start = get_tsc();
|
||||
do pause();
|
||||
while (get_tsc() - start < cycles);
|
||||
}
|
||||
|
||||
// Align to power of 2
|
||||
|
@ -6,6 +6,25 @@
|
||||
#define USE_STD
|
||||
#endif
|
||||
|
||||
#ifdef _MSC_VER
|
||||
|
||||
#include "emmintrin.h"
|
||||
#include "immintrin.h"
|
||||
|
||||
namespace utils
|
||||
{
|
||||
u128 __vectorcall atomic_load16(const void* ptr)
|
||||
{
|
||||
return std::bit_cast<u128>(_mm_load_si128((__m128i*)ptr));
|
||||
}
|
||||
|
||||
void __vectorcall atomic_store16(void* ptr, u128 value)
|
||||
{
|
||||
_mm_store_si128((__m128i*)ptr, std::bit_cast<__m128i>(value));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#include "Utilities/sync.h"
|
||||
#include "Utilities/StrFmt.h"
|
||||
|
||||
@ -847,9 +866,17 @@ namespace
|
||||
};
|
||||
}
|
||||
|
||||
u64 atomic_wait::get_unique_tsc()
|
||||
#ifdef _MSC_VER
|
||||
extern "C" u64 __rdtsc();
|
||||
#endif
|
||||
|
||||
u64 utils::get_unique_tsc()
|
||||
{
|
||||
#ifdef _MSC_VER
|
||||
const u64 stamp0 = __rdtsc();
|
||||
#else
|
||||
const u64 stamp0 = __builtin_ia32_rdtsc();
|
||||
#endif
|
||||
|
||||
return s_min_tsc.atomic_op([&](u64& tsc)
|
||||
{
|
||||
@ -1026,7 +1053,7 @@ FORCE_INLINE auto root_info::slot_search(uptr iptr, u32 size, u64 thread_id, u12
|
||||
|
||||
SAFE_BUFFERS void atomic_wait_engine::wait(const void* data, u32 size, u128 old_value, u64 timeout, u128 mask, atomic_wait::info* ext)
|
||||
{
|
||||
const auto stamp0 = atomic_wait::get_unique_tsc();
|
||||
const auto stamp0 = utils::get_unique_tsc();
|
||||
|
||||
if (!s_tls_wait_cb(data, 0, stamp0))
|
||||
{
|
||||
|
@ -7,6 +7,62 @@
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning(push)
|
||||
#pragma warning(disable: 4996)
|
||||
|
||||
extern "C"
|
||||
{
|
||||
void _ReadWriteBarrier();
|
||||
void* _AddressOfReturnAddress();
|
||||
|
||||
uchar _bittest(const long*, long);
|
||||
uchar _interlockedbittestandset(volatile long*, long);
|
||||
uchar _interlockedbittestandreset(volatile long*, long);
|
||||
|
||||
char _InterlockedCompareExchange8(volatile char*, char, char);
|
||||
char _InterlockedExchange8(volatile char*, char);
|
||||
char _InterlockedExchangeAdd8(volatile char*, char);
|
||||
char _InterlockedAnd8(volatile char*, char);
|
||||
char _InterlockedOr8(volatile char*, char);
|
||||
char _InterlockedXor8(volatile char*, char);
|
||||
|
||||
short _InterlockedCompareExchange16(volatile short*, short, short);
|
||||
short _InterlockedExchange16(volatile short*, short);
|
||||
short _InterlockedExchangeAdd16(volatile short*, short);
|
||||
short _InterlockedAnd16(volatile short*, short);
|
||||
short _InterlockedOr16(volatile short*, short);
|
||||
short _InterlockedXor16(volatile short*, short);
|
||||
short _InterlockedIncrement16(volatile short*);
|
||||
short _InterlockedDecrement16(volatile short*);
|
||||
|
||||
long _InterlockedCompareExchange(volatile long*, long, long);
|
||||
long _InterlockedCompareExchange_HLEAcquire(volatile long*, long, long);
|
||||
long _InterlockedExchange(volatile long*, long);
|
||||
long _InterlockedExchangeAdd(volatile long*, long);
|
||||
long _InterlockedExchangeAdd_HLERelease(volatile long*, long);
|
||||
long _InterlockedAnd(volatile long*, long);
|
||||
long _InterlockedOr(volatile long*, long);
|
||||
long _InterlockedXor(volatile long*, long);
|
||||
long _InterlockedIncrement(volatile long*);
|
||||
long _InterlockedDecrement(volatile long*);
|
||||
|
||||
s64 _InterlockedCompareExchange64(volatile s64*, s64, s64);
|
||||
s64 _InterlockedCompareExchange64_HLEAcquire(volatile s64*, s64, s64);
|
||||
s64 _InterlockedExchange64(volatile s64*, s64);
|
||||
s64 _InterlockedExchangeAdd64(volatile s64*, s64);
|
||||
s64 _InterlockedExchangeAdd64_HLERelease(volatile s64*, s64);
|
||||
s64 _InterlockedAnd64(volatile s64*, s64);
|
||||
s64 _InterlockedOr64(volatile s64*, s64);
|
||||
s64 _InterlockedXor64(volatile s64*, s64);
|
||||
s64 _InterlockedIncrement64(volatile s64*);
|
||||
s64 _InterlockedDecrement64(volatile s64*);
|
||||
|
||||
uchar _InterlockedCompareExchange128(volatile s64*, s64, s64, s64*);
|
||||
}
|
||||
|
||||
namespace utils
|
||||
{
|
||||
u128 __vectorcall atomic_load16(const void*);
|
||||
void __vectorcall atomic_store16(void*, u128);
|
||||
}
|
||||
#endif
|
||||
|
||||
FORCE_INLINE void atomic_fence_consume()
|
||||
@ -238,7 +294,10 @@ namespace atomic_wait
|
||||
|
||||
template <typename... T, typename = std::void_t<decltype(std::declval<T>().template wait<op::eq>(any_value))...>>
|
||||
list(T&... vars) -> list<sizeof...(T), T...>;
|
||||
}
|
||||
|
||||
namespace utils
|
||||
{
|
||||
// RDTSC with adjustment for being unique
|
||||
u64 get_unique_tsc();
|
||||
}
|
||||
@ -871,18 +930,14 @@ struct atomic_storage<T, 16> : atomic_storage<T, 0>
|
||||
static inline T load(const T& dest)
|
||||
{
|
||||
atomic_fence_acquire();
|
||||
__m128i val = _mm_load_si128(reinterpret_cast<const __m128i*>(&dest));
|
||||
u128 val = utils::atomic_load16(&dest);
|
||||
atomic_fence_acquire();
|
||||
return std::bit_cast<T>(val);
|
||||
}
|
||||
|
||||
static inline T observe(const T& dest)
|
||||
{
|
||||
// Barriers are kept intentionally
|
||||
atomic_fence_acquire();
|
||||
__m128i val = _mm_load_si128(reinterpret_cast<const __m128i*>(&dest));
|
||||
atomic_fence_acquire();
|
||||
return std::bit_cast<T>(val);
|
||||
return load(dest);
|
||||
}
|
||||
|
||||
static inline bool compare_exchange(T& dest, T& comp, T exch)
|
||||
@ -906,32 +961,31 @@ struct atomic_storage<T, 16> : atomic_storage<T, 0>
|
||||
static inline void store(T& dest, T value)
|
||||
{
|
||||
atomic_fence_acq_rel();
|
||||
_mm_store_si128(reinterpret_cast<__m128i*>(&dest), std::bit_cast<__m128i>(value));
|
||||
release(dest, value);
|
||||
atomic_fence_seq_cst();
|
||||
}
|
||||
|
||||
static inline void release(T& dest, T value)
|
||||
{
|
||||
atomic_fence_release();
|
||||
_mm_store_si128(reinterpret_cast<__m128i*>(&dest), std::bit_cast<__m128i>(value));
|
||||
utils::atomic_store16(&dest, std::bit_cast<u128>(value));
|
||||
atomic_fence_release();
|
||||
}
|
||||
#else
|
||||
static inline T load(const T& dest)
|
||||
{
|
||||
__atomic_thread_fence(__ATOMIC_ACQUIRE);
|
||||
__m128i val = _mm_load_si128(reinterpret_cast<const __m128i*>(&dest));
|
||||
__atomic_thread_fence(__ATOMIC_ACQUIRE);
|
||||
return std::bit_cast<T>(val);
|
||||
__m128i r;
|
||||
#ifdef __AVX__
|
||||
__asm__ volatile("vmovdqa %1, %0;" : "=x" (r) : "m" (dest) : "memory");
|
||||
#else
|
||||
__asm__ volatile("movdqa %1, %0;" : "=x" (r) : "m" (dest) : "memory");
|
||||
#endif
|
||||
return std::bit_cast<T>(r);
|
||||
}
|
||||
|
||||
static inline T observe(const T& dest)
|
||||
{
|
||||
// Barriers are kept intentionally
|
||||
__atomic_thread_fence(__ATOMIC_ACQUIRE);
|
||||
__m128i val = _mm_load_si128(reinterpret_cast<const __m128i*>(&dest));
|
||||
__atomic_thread_fence(__ATOMIC_ACQUIRE);
|
||||
return std::bit_cast<T>(val);
|
||||
return load(dest);
|
||||
}
|
||||
|
||||
static inline bool compare_exchange(T& dest, T& comp, T exch)
|
||||
@ -987,16 +1041,17 @@ struct atomic_storage<T, 16> : atomic_storage<T, 0>
|
||||
|
||||
static inline void store(T& dest, T value)
|
||||
{
|
||||
__atomic_thread_fence(__ATOMIC_ACQ_REL);
|
||||
_mm_store_si128(reinterpret_cast<__m128i*>(&dest), std::bit_cast<__m128i>(value));
|
||||
release(dest, value);
|
||||
atomic_fence_seq_cst();
|
||||
}
|
||||
|
||||
static inline void release(T& dest, T value)
|
||||
{
|
||||
__atomic_thread_fence(__ATOMIC_RELEASE);
|
||||
_mm_store_si128(reinterpret_cast<__m128i*>(&dest), std::bit_cast<__m128i>(value));
|
||||
__atomic_thread_fence(__ATOMIC_RELEASE);
|
||||
#ifdef __AVX__
|
||||
__asm__ volatile("vmovdqa %0, %1;" :: "x" (reinterpret_cast<__m128i&>(value)), "m" (dest) : "memory");
|
||||
#else
|
||||
__asm__ volatile("movdqa %0, %1;" :: "x" (reinterpret_cast<__m128i&>(value)), "m" (dest) : "memory");
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -1,532 +0,0 @@
|
||||
#include "atomic2.hpp"
|
||||
#include "Utilities/JIT.h"
|
||||
|
||||
#include "util/sysinfo.hpp"
|
||||
|
||||
//
|
||||
static const bool s_use_rtm = utils::has_rtm();
|
||||
|
||||
// Generated routine (one per Count, Count <= 8) that tries to apply the first Count
// multi_cas_item entries inside one RTM transaction (xbegin/xend).
//
// The offsets used below read each item as three consecutive 8-byte fields at a 24-byte
// stride: target pointer, old value, new value.
// NOTE(review): this relies on the in-memory layout of stx::multi_cas_item -- confirm.
//
// Value left in eax on return:
//  - 1 after the write phase and xend (success);
//  - otherwise eax shifted arithmetically right by 24 once the _XABORT_RETRY test fails:
//    0 on the "stop" path (eax was zeroed), a negative value on the "wait" path
//    (eax was set to 0xff << 24), or whatever the abort status yields on a hardware abort.
// While the _XABORT_RETRY bit is set in eax the transaction is restarted from "begin".
template <unsigned Count>
|
||||
static const auto commit_tx = build_function_asm<s32(*)(const stx::multi_cas_item*)>([](asmjit::X86Assembler& c, auto& args)
|
||||
{
|
||||
static_assert(Count <= 8);
|
||||
using namespace asmjit;
|
||||
|
||||
// Fill registers with item data
|
||||
// rax = args[0] + 120, so every item field is addressed with a displacement in [-120, 71]
c.lea(x86::rax, x86::qword_ptr(args[0], 120));
|
||||
|
||||
// Item 0: rcx = pointer, rdx = old value, r8 = new value
if constexpr (Count >= 1)
|
||||
{
|
||||
c.mov(x86::rcx, x86::qword_ptr(x86::rax, -120));
|
||||
c.mov(x86::rdx, x86::qword_ptr(x86::rax, -112));
|
||||
c.mov(x86::r8, x86::qword_ptr(x86::rax, -104));
|
||||
}
|
||||
// Item 1: r9 = pointer, r10 = old value, r11 = new value
if constexpr (Count >= 2)
|
||||
{
|
||||
c.mov(x86::r9, x86::qword_ptr(x86::rax, -96));
|
||||
c.mov(x86::r10, x86::qword_ptr(x86::rax, -88));
|
||||
c.mov(x86::r11, x86::qword_ptr(x86::rax, -80));
|
||||
}
|
||||
// Items 2..7: pointer goes to a general-purpose register, the following 16 bytes
// (old value in the low half, new value in the high half) go to an xmm register
if constexpr (Count >= 3)
|
||||
{
|
||||
if (utils::has_avx())
|
||||
{
|
||||
c.vzeroupper();
|
||||
}
|
||||
|
||||
// rsi is saved only in _WIN32 builds (restored before ret below)
#ifdef _WIN32
|
||||
c.push(x86::rsi);
|
||||
#endif
|
||||
c.mov(x86::rsi, x86::qword_ptr(x86::rax, -72));
|
||||
c.movups(x86::xmm0, x86::oword_ptr(x86::rax, -64));
|
||||
}
|
||||
if constexpr (Count >= 4)
|
||||
{
|
||||
// rdi is saved only in _WIN32 builds (restored before ret below)
#ifdef _WIN32
|
||||
c.push(x86::rdi);
|
||||
#endif
|
||||
c.mov(x86::rdi, x86::qword_ptr(x86::rax, -48));
|
||||
c.movups(x86::xmm1, x86::oword_ptr(x86::rax, -40));
|
||||
}
|
||||
if constexpr (Count >= 5)
|
||||
{
|
||||
c.push(x86::rbx);
|
||||
c.mov(x86::rbx, x86::qword_ptr(x86::rax, -24));
|
||||
c.movups(x86::xmm2, x86::oword_ptr(x86::rax, -16));
|
||||
}
|
||||
if constexpr (Count >= 6)
|
||||
{
|
||||
c.push(x86::rbp);
|
||||
c.mov(x86::rbp, x86::qword_ptr(x86::rax));
|
||||
c.movups(x86::xmm3, x86::oword_ptr(x86::rax, 8));
|
||||
}
|
||||
if constexpr (Count >= 7)
|
||||
{
|
||||
c.push(x86::r12);
|
||||
c.mov(x86::r12, x86::qword_ptr(x86::rax, 24));
|
||||
c.movups(x86::xmm4, x86::oword_ptr(x86::rax, 32));
|
||||
}
|
||||
if constexpr (Count >= 8)
|
||||
{
|
||||
c.push(x86::r13);
|
||||
c.mov(x86::r13, x86::qword_ptr(x86::rax, 48));
|
||||
c.movups(x86::xmm5, x86::oword_ptr(x86::rax, 56));
|
||||
}
|
||||
|
||||
// Begin transaction
|
||||
Label begin = c.newLabel();
|
||||
Label fall = c.newLabel();
|
||||
Label stop = c.newLabel();
|
||||
Label wait = c.newLabel();
|
||||
Label ret = c.newLabel();
|
||||
c.bind(begin);
|
||||
c.xbegin(fall);
|
||||
|
||||
// Compare phase
|
||||
// Each target's first 8 bytes must equal the item's old value, otherwise go to "stop"
if constexpr (Count >= 1)
|
||||
{
|
||||
c.cmp(x86::qword_ptr(x86::rcx), x86::rdx);
|
||||
c.jne(stop);
|
||||
}
|
||||
if constexpr (Count >= 2)
|
||||
{
|
||||
c.cmp(x86::qword_ptr(x86::r9), x86::r10);
|
||||
c.jne(stop);
|
||||
}
|
||||
if constexpr (Count >= 3)
|
||||
{
|
||||
// movq extracts the low half of the xmm register (the old value)
c.movq(x86::rax, x86::xmm0);
|
||||
c.cmp(x86::qword_ptr(x86::rsi), x86::rax);
|
||||
c.jne(stop);
|
||||
}
|
||||
if constexpr (Count >= 4)
|
||||
{
|
||||
c.movq(x86::rax, x86::xmm1);
|
||||
c.cmp(x86::qword_ptr(x86::rdi), x86::rax);
|
||||
c.jne(stop);
|
||||
}
|
||||
if constexpr (Count >= 5)
|
||||
{
|
||||
c.movq(x86::rax, x86::xmm2);
|
||||
c.cmp(x86::qword_ptr(x86::rbx), x86::rax);
|
||||
c.jne(stop);
|
||||
}
|
||||
if constexpr (Count >= 6)
|
||||
{
|
||||
c.movq(x86::rax, x86::xmm3);
|
||||
c.cmp(x86::qword_ptr(x86::rbp), x86::rax);
|
||||
c.jne(stop);
|
||||
}
|
||||
if constexpr (Count >= 7)
|
||||
{
|
||||
c.movq(x86::rax, x86::xmm4);
|
||||
c.cmp(x86::qword_ptr(x86::r12), x86::rax);
|
||||
c.jne(stop);
|
||||
}
|
||||
if constexpr (Count >= 8)
|
||||
{
|
||||
c.movq(x86::rax, x86::xmm5);
|
||||
c.cmp(x86::qword_ptr(x86::r13), x86::rax);
|
||||
c.jne(stop);
|
||||
}
|
||||
|
||||
// Check for transactions in progress
|
||||
// The 8 bytes at offset 8 of each target must be zero, otherwise go to "wait"
if constexpr (Count >= 1)
|
||||
{
|
||||
c.cmp(x86::qword_ptr(x86::rcx, 8), 0);
|
||||
c.jne(wait);
|
||||
}
|
||||
if constexpr (Count >= 2)
|
||||
{
|
||||
c.cmp(x86::qword_ptr(x86::r9, 8), 0);
|
||||
c.jne(wait);
|
||||
}
|
||||
if constexpr (Count >= 3)
|
||||
{
|
||||
c.cmp(x86::qword_ptr(x86::rsi, 8), 0);
|
||||
c.jne(wait);
|
||||
}
|
||||
if constexpr (Count >= 4)
|
||||
{
|
||||
c.cmp(x86::qword_ptr(x86::rdi, 8), 0);
|
||||
c.jne(wait);
|
||||
}
|
||||
if constexpr (Count >= 5)
|
||||
{
|
||||
c.cmp(x86::qword_ptr(x86::rbx, 8), 0);
|
||||
c.jne(wait);
|
||||
}
|
||||
if constexpr (Count >= 6)
|
||||
{
|
||||
c.cmp(x86::qword_ptr(x86::rbp, 8), 0);
|
||||
c.jne(wait);
|
||||
}
|
||||
if constexpr (Count >= 7)
|
||||
{
|
||||
c.cmp(x86::qword_ptr(x86::r12, 8), 0);
|
||||
c.jne(wait);
|
||||
}
|
||||
if constexpr (Count >= 8)
|
||||
{
|
||||
c.cmp(x86::qword_ptr(x86::r13, 8), 0);
|
||||
c.jne(wait);
|
||||
}
|
||||
|
||||
// Write phase
|
||||
// Store the new value into the first 8 bytes of each target
// (movhps writes the high half of the xmm register, i.e. the new value)
if constexpr (Count >= 1)
|
||||
c.mov(x86::qword_ptr(x86::rcx), x86::r8);
|
||||
if constexpr (Count >= 2)
|
||||
c.mov(x86::qword_ptr(x86::r9), x86::r11);
|
||||
if constexpr (Count >= 3)
|
||||
c.movhps(x86::qword_ptr(x86::rsi), x86::xmm0);
|
||||
if constexpr (Count >= 4)
|
||||
c.movhps(x86::qword_ptr(x86::rdi), x86::xmm1);
|
||||
if constexpr (Count >= 5)
|
||||
c.movhps(x86::qword_ptr(x86::rbx), x86::xmm2);
|
||||
if constexpr (Count >= 6)
|
||||
c.movhps(x86::qword_ptr(x86::rbp), x86::xmm3);
|
||||
if constexpr (Count >= 7)
|
||||
c.movhps(x86::qword_ptr(x86::r12), x86::xmm4);
|
||||
if constexpr (Count >= 8)
|
||||
c.movhps(x86::qword_ptr(x86::r13), x86::xmm5);
|
||||
|
||||
// End transaction (success)
|
||||
c.xend();
|
||||
c.mov(x86::eax, 1);
|
||||
// Common epilogue: restore saved registers in reverse push order
c.bind(ret);
|
||||
if constexpr (Count >= 8)
|
||||
c.pop(x86::r13);
|
||||
if constexpr (Count >= 7)
|
||||
c.pop(x86::r12);
|
||||
if constexpr (Count >= 6)
|
||||
c.pop(x86::rbp);
|
||||
if constexpr (Count >= 5)
|
||||
c.pop(x86::rbx);
|
||||
#ifdef _WIN32
|
||||
if constexpr (Count >= 4)
|
||||
c.pop(x86::rdi);
|
||||
if constexpr (Count >= 3)
|
||||
c.pop(x86::rsi);
|
||||
#endif
|
||||
c.ret();
|
||||
|
||||
// Transaction abort
|
||||
// Reached when a compare failed: nothing was written, close the transaction with eax = 0
c.bind(stop);
|
||||
c.xend();
|
||||
c.xor_(x86::eax, x86::eax);
|
||||
c.jmp(fall);
|
||||
|
||||
// Abort when there is still a chance of success
|
||||
// Reached when a target has a non-zero second half; eax = 0xff << 24 becomes negative after sar below
c.bind(wait);
|
||||
c.xend();
|
||||
c.mov(x86::eax, 0xffu << 24);
|
||||
c.jmp(fall);
|
||||
|
||||
// Transaction fallback: return zero
|
||||
// Also the xbegin abort target: retry while _XABORT_RETRY is set in eax, else return eax >> 24
c.bind(fall);
|
||||
c.test(x86::eax, _XABORT_RETRY);
|
||||
c.jnz(begin);
|
||||
c.sar(x86::eax, 24);
|
||||
c.jmp(ret);
|
||||
});
|
||||
|
||||
// 4095 records max
|
||||
// Number of 64-bit allocation groups: 64 groups x 64 bits = 4096 slots
// (slot 0 is pre-marked as used in s_rec_bits below, leaving 4095)
static constexpr u64 s_rec_gcount = 4096 / 64;
|
||||
|
||||
// Global record pool
|
||||
static stx::multi_cas_record s_records[s_rec_gcount * 64]{};
|
||||
|
||||
// Allocation bits (without first element)
|
||||
// One bit per record in s_records; a set bit means the slot is taken.
// The initializer sets bit 0 of the first group, so record index 0 is never handed out.
static atomic_t<u64> s_rec_bits[s_rec_gcount]{1};
|
||||
|
||||
// Layout of multi_cas_record::m_state as used in this file:
// the two low bits hold the transaction state, the remaining bits hold a reference
// count that is changed in steps of s_ref_one.
static constexpr u64 s_state_mask = 3;
|
||||
static constexpr u64 s_state_undef = 0;
|
||||
static constexpr u64 s_state_failure = 1;
|
||||
static constexpr u64 s_state_success = 2;
|
||||
static constexpr u64 s_ref_mask = ~s_state_mask;
|
||||
static constexpr u64 s_ref_one = s_state_mask + 1;
|
||||
|
||||
// Allocates one slot of s_records and returns its index.
// Groups of s_rec_bits are probed one after another, starting from a group derived from
// the low 32 bits of __rdtsc(); the first group with a clear bit gets its lowest clear
// bit set. The loop has no exit condition, so this keeps probing while every group is full.
static u64 rec_alloc()
|
||||
{
|
||||
const u32 start = static_cast<u32>(__rdtsc());
|
||||
|
||||
for (u32 i = 0;; i++)
|
||||
{
|
||||
const u32 group = (i + start) % s_rec_gcount;
|
||||
|
||||
// NOTE(review): `bits` below is presumably the group's value before the modification
// (the index computation depends on it) -- confirm against atomic_t::fetch_op
const auto [bits, ok] = s_rec_bits[group].fetch_op([](u64& bits)
|
||||
{
|
||||
// True when at least one bit of the group is clear
if (~bits)
|
||||
{
|
||||
// Set lowest clear bit
|
||||
bits |= bits + 1;
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
});
|
||||
|
||||
if (ok)
|
||||
{
|
||||
// Find lowest clear bit
|
||||
// Number of trailing one bits == position of the lowest clear bit
return group * 64 + std::countr_one(bits);
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: unreachable
|
||||
std::abort();
|
||||
return 0;
|
||||
}
|
||||
|
||||
// 16-byte compare-and-swap on `dest`.
// dest      - two s64 values treated as one 128-bit object
// cmp_res   - expected value; both underlying primitives write the observed value back
//             into it when the comparison fails (callers in this file re-inspect it)
// exch_high - replacement for element 1, exch_low - replacement for element 0
// Returns true if the exchange took place.
static bool cmpxchg16(s64(&dest)[2], s64(&cmp_res)[2], s64 exch_high, s64 exch_low)
|
||||
{
|
||||
#ifdef _MSC_VER
|
||||
return !!_InterlockedCompareExchange128(dest, exch_high, exch_low, cmp_res);
|
||||
#else
|
||||
// Element order matches dest: [0] = low, [1] = high
s64 exch[2]{exch_low, exch_high};
|
||||
// Strong CAS (weak == false), sequentially consistent on both success and failure
return __atomic_compare_exchange(&dest, &cmp_res, &exch, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Tries to apply every recorded CAS item as one unit.
// Returns true when there is nothing to do (m_count == 0) or the new values were installed,
// false when the attempt failed.
//
// Paths, in order:
//  1. m_count == 0: returns true immediately.
//  2. m_count == 1: a single 16-byte CAS on the target.
//  3. s_use_rtm and 2 <= m_count <= 8: the generated commit_tx routine; a zero result
//     falls through to path 4, any other result decides the outcome (positive == success).
//  4. Fallback: a slot of s_records is allocated, each target is tagged with the slot index
//     in its second 8 bytes, the outcome is recorded in the slot's m_state, and the tags are
//     then replaced by zero together with either the new or the old value.
bool stx::multi_cas_record::commit() const noexcept
|
||||
{
|
||||
// Transaction cancelled
|
||||
if (m_count == 0)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
// Drops one reference from record `id` (ignored for 0 and out-of-range ids).
// When the reference part of m_state reaches zero the whole state is reset to 0
// and the record's allocation bit in s_rec_bits is cleared.
static auto rec_unref = [](u64 id)
|
||||
{
|
||||
if (id && id < s_rec_gcount * 64)
|
||||
{
|
||||
// Lambda result: 0 = no reference to drop, 1 = dropped, 2 = dropped the last one
auto [_, ok] = s_records[id].m_state.fetch_op([](u64& state)
|
||||
{
|
||||
if (state < s_ref_one)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
state -= s_ref_one;
|
||||
|
||||
if (state < s_ref_one)
|
||||
{
|
||||
state = 0;
|
||||
return 2;
|
||||
}
|
||||
|
||||
return 1;
|
||||
});
|
||||
|
||||
if (ok > 1)
|
||||
{
|
||||
s_rec_bits[id / 64] &= ~(u64{1} << (id % 64));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Helper function to complete successful transaction
|
||||
// For every item of record `id` whose target still holds {old value, id},
// replaces it with {new value, 0}; the CAS result is intentionally not used.
static auto rec_complete = [](u64 id)
|
||||
{
|
||||
for (u32 i = 0; i < s_records[id].m_count; i++)
|
||||
{
|
||||
auto& item = s_records[id].m_list[i];
|
||||
|
||||
atomic2 cmp;
|
||||
cmp.m_data[0] = item.m_old;
|
||||
cmp.m_data[1] = id;
|
||||
|
||||
if (item.m_addr->load() == item.m_old && atomic_storage<s64>::load(item.m_addr->m_data[1]) == static_cast<s64>(id))
|
||||
{
|
||||
if (cmpxchg16(item.m_addr->m_data, cmp.m_data, 0, item.m_new))
|
||||
{
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Helper function to deal with existing transaction
|
||||
// Aborts the process if `id` is out of range. Otherwise:
//  - returns 0 if the record has no references (nothing was changed);
//  - adds a reference, and marks the record as failed if its state was still undecided;
//  - returns `id` if the record was not in the success state: the caller may overwrite the
//    tagged target and must call rec_unref(id) afterwards;
//  - if the record was in the success state, helps finishing it, drops the reference
//    and returns 0.
// NOTE(review): `_old` is presumably the state before the modification -- confirm
// against atomic_t::fetch_op.
static auto rec_try_abort = [](u64 id) -> u64
|
||||
{
|
||||
if (id >= s_rec_gcount * 64)
|
||||
{
|
||||
std::abort();
|
||||
}
|
||||
|
||||
auto [_old, ok] = s_records[id].m_state.fetch_op([](u64& state)
|
||||
{
|
||||
if (state < s_ref_one)
|
||||
{
|
||||
// Don't reference if no references
|
||||
return false;
|
||||
}
|
||||
|
||||
if ((state & s_state_mask) == s_state_undef)
|
||||
{
|
||||
// Break transaction if possible
|
||||
state |= s_state_failure;
|
||||
}
|
||||
|
||||
state += s_ref_one;
|
||||
return true;
|
||||
});
|
||||
|
||||
if (!ok)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
if ((_old & s_state_mask) != s_state_success)
|
||||
{
|
||||
// Allow to overwrite failing transaction
|
||||
return id;
|
||||
}
|
||||
|
||||
// Help to complete
|
||||
rec_complete(id);
|
||||
rec_unref(id);
|
||||
return 0;
|
||||
};
|
||||
|
||||
// Single CAS path
|
||||
if (m_count == 1)
|
||||
{
|
||||
atomic2 cmp;
|
||||
|
||||
// Loops until the CAS succeeds or the target's value is seen to differ from m_old
while (auto ptr = m_list[0].m_addr)
|
||||
{
|
||||
if (ptr->load() != m_list[0].m_old)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
// Expected 16-byte content: {old value, currently observed tag}
cmp.m_data[0] = m_list[0].m_old;
|
||||
cmp.m_data[1] = atomic_storage<s64>::load(ptr->m_data[1]);
|
||||
|
||||
// Untagged target: swap {old, 0} -> {new, 0} directly
if (!cmp.m_data[1] && cmpxchg16(ptr->m_data, cmp.m_data, 0, m_list[0].m_new))
|
||||
{
|
||||
return true;
|
||||
}
|
||||
// A failed CAS refreshed cmp: the value itself has changed
else if (cmp.m_data[0] != static_cast<s64>(m_list[0].m_old))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
// Target is tagged by another record: try to take it over
else if (cmp.m_data[1])
|
||||
{
|
||||
if (u64 _id = rec_try_abort(cmp.m_data[1]))
|
||||
{
|
||||
if (cmpxchg16(ptr->m_data, cmp.m_data, 0, m_list[0].m_new))
|
||||
{
|
||||
rec_unref(_id);
|
||||
return true;
|
||||
}
|
||||
|
||||
rec_unref(_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Unreachable
|
||||
std::abort();
|
||||
}
|
||||
|
||||
// Try TSX if available
|
||||
if (s_use_rtm)
|
||||
{
|
||||
// r == 0 leaves the switch and continues with the fallback below; counts above 8 have no case
switch (m_count)
|
||||
{
|
||||
case 2: if (s32 r = commit_tx<2>(m_list)) return r > 0; break;
|
||||
case 3: if (s32 r = commit_tx<3>(m_list)) return r > 0; break;
|
||||
case 4: if (s32 r = commit_tx<4>(m_list)) return r > 0; break;
|
||||
case 5: if (s32 r = commit_tx<5>(m_list)) return r > 0; break;
|
||||
case 6: if (s32 r = commit_tx<6>(m_list)) return r > 0; break;
|
||||
case 7: if (s32 r = commit_tx<7>(m_list)) return r > 0; break;
|
||||
case 8: if (s32 r = commit_tx<8>(m_list)) return r > 0; break;
|
||||
}
|
||||
}
|
||||
|
||||
// Allocate global record and copy data
|
||||
const u64 id = rec_alloc();
|
||||
|
||||
// Items are copied two at a time, so an odd m_count copies one extra trailing item
for (u32 i = 0; i < (m_count + 1) / 2; i++)
|
||||
{
|
||||
std::memcpy(s_records[id].m_list + i * 2, m_list + i * 2, sizeof(multi_cas_item) * 2);
|
||||
}
|
||||
|
||||
// Publish the item count and start with one reference and an undecided state
s_records[id].m_count = m_count;
|
||||
s_records[id].m_state = s_ref_one;
|
||||
|
||||
// Try to install CAS items
|
||||
// Stops early once the record's state is no longer undecided
for (u32 i = 0; i < m_count && (s_records[id].m_state & s_state_mask) == s_state_undef; i++)
|
||||
{
|
||||
atomic2 cmp;
|
||||
|
||||
// Each `break` below leaves only this inner loop
while (auto ptr = m_list[i].m_addr)
|
||||
{
|
||||
if (ptr->load() != m_list[i].m_old)
|
||||
{
|
||||
s_records[id].m_state |= s_state_failure;
|
||||
break;
|
||||
}
|
||||
|
||||
cmp.m_data[0] = m_list[i].m_old;
|
||||
cmp.m_data[1] = atomic_storage<s64>::load(ptr->m_data[1]);
|
||||
|
||||
// Untagged target: keep the old value and tag it with this record's id
if (!cmp.m_data[1] && cmpxchg16(ptr->m_data, cmp.m_data, id, m_list[i].m_old))
|
||||
{
|
||||
break;
|
||||
}
|
||||
// A failed CAS refreshed cmp: the value itself has changed
else if (cmp.m_data[0] != static_cast<s64>(m_list[i].m_old))
|
||||
{
|
||||
s_records[id].m_state |= s_state_failure;
|
||||
break;
|
||||
}
|
||||
// Target is tagged by another record: try to replace its tag with ours
else if (cmp.m_data[1])
|
||||
{
|
||||
if (u64 _id = rec_try_abort(cmp.m_data[1]))
|
||||
{
|
||||
if (cmpxchg16(ptr->m_data, cmp.m_data, id, m_list[i].m_old))
|
||||
{
|
||||
rec_unref(_id);
|
||||
break;
|
||||
}
|
||||
|
||||
rec_unref(_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Try to acknowledge transaction success
|
||||
// `ok` is false if the failure bit was set (by this thread or via rec_try_abort elsewhere)
auto [_, ok] = s_records[id].m_state.fetch_op([](u64& state)
|
||||
{
|
||||
if (state & s_state_failure)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
state |= s_state_success;
|
||||
return true;
|
||||
});
|
||||
|
||||
// Complete transaction on success, or cleanup on failure
|
||||
for (u32 i = 0; i < m_count; i++)
|
||||
{
|
||||
auto& item = m_list[i];
|
||||
|
||||
atomic2 cmp;
|
||||
cmp.m_data[0] = item.m_old;
|
||||
cmp.m_data[1] = id;
|
||||
|
||||
// Only targets that still carry {old value, our id} are touched
if (item.m_addr->load() == item.m_old && atomic_storage<s64>::load(item.m_addr->m_data[1]) == static_cast<s64>(id))
|
||||
{
|
||||
// Restore old or set new
|
||||
if (cmpxchg16(item.m_addr->m_data, cmp.m_data, 0, ok ? item.m_new : item.m_old))
|
||||
{
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Drop the reference taken when m_state was set to s_ref_one above
rec_unref(id);
|
||||
return ok;
|
||||
}
|
@ -1,156 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include "util/atomic.hpp"
|
||||
|
||||
namespace stx
|
||||
{
|
||||
// Unsigned 64-bit atomic for multi-cas (occupies 128 bits)
|
||||
// Non-copyable; 16-byte aligned. multi_cas_record is a friend and accesses m_data directly.
class alignas(16) atomic2
|
||||
{
|
||||
// First 64-bit value is an actual value, second one is an allocated control block pointer (if not zero)
|
||||
s64 m_data[2]{};
|
||||
|
||||
friend class multi_cas_record;
|
||||
|
||||
public:
|
||||
// Can't be really uninitialized or it'll be fundamentally broken
|
||||
constexpr atomic2() noexcept = default;
|
||||
|
||||
atomic2(const atomic2&) = delete;
|
||||
|
||||
atomic2& operator=(const atomic2&) = delete;
|
||||
|
||||
// Initializes the value part with `value` and the second element with zero
constexpr atomic2(u64 value) noexcept
|
||||
: m_data{static_cast<s64>(value), s64{0}}
|
||||
{
|
||||
}
|
||||
|
||||
// Simply observe the state
|
||||
// Reads only the value part (m_data[0]); the second element is not inspected.
// NOTE(review): m_data[0] is s64 while atomic_storage<u64>::load is used here -- confirm
// the call reads the stored object itself rather than a converted temporary.
u64 load() const noexcept
|
||||
{
|
||||
return atomic_storage<u64>::load(m_data[0]);
|
||||
}
|
||||
|
||||
// void wait(u64 old_value) const noexcept;
|
||||
// void notify_one() noexcept;
|
||||
// void notify_all() noexcept;
|
||||
};
|
||||
|
||||
// Atomic CAS item
|
||||
// Describes one pending compare-and-swap: target, expected (old) value and desired (new) value.
// Non-copyable; filled in by multi_cas_record (a friend).
class multi_cas_item
|
||||
{
|
||||
// Target atomic
atomic2* m_addr;
|
||||
// Value observed when the item was created
u64 m_old;
|
||||
// Value to install on commit
u64 m_new;
|
||||
|
||||
friend class multi_cas_record;
|
||||
|
||||
public:
|
||||
// No member initializers are given, so default-initialization leaves the fields unset
multi_cas_item() noexcept = default;
|
||||
|
||||
multi_cas_item(const multi_cas_item&) = delete;
|
||||
|
||||
multi_cas_item& operator=(const multi_cas_item&) = delete;
|
||||
|
||||
// Returns the expected (old) value
u64 get_old() const noexcept
|
||||
{
|
||||
return m_old;
|
||||
}
|
||||
|
||||
// Implicit conversion yields the desired (new) value
operator u64() const noexcept
|
||||
{
|
||||
return m_new;
|
||||
}
|
||||
|
||||
// Assignment changes only the desired (new) value
void operator=(u64 value) noexcept
|
||||
{
|
||||
m_new = value;
|
||||
}
|
||||
};
|
||||
|
||||
// An object passed to multi_cas lambda
|
||||
// Collects up to 10 CAS items and commits them together. 64-byte aligned.
// NOTE: m_state and m_count have no initializers here; multi_cas calls cancel() before use.
class alignas(64) multi_cas_record
|
||||
{
|
||||
// Ref counter and Multi-CAS state
|
||||
atomic_t<u64> m_state;
|
||||
|
||||
// Total number of CASes
|
||||
u64 m_count;
|
||||
|
||||
// Support up to 10 CASes
|
||||
multi_cas_item m_list[10];
|
||||
|
||||
public:
|
||||
// Read atomic value and allocate "writable" item
|
||||
// Aborts the process if all items of m_list are already in use.
// The returned item starts with m_new equal to the value just read.
multi_cas_item& load(atomic2& atom) noexcept
|
||||
{
|
||||
if (m_count >= std::size(m_list))
|
||||
{
|
||||
std::abort();
|
||||
}
|
||||
|
||||
auto& r = m_list[m_count++];
|
||||
r.m_addr = &atom;
|
||||
r.m_old = atom.load();
|
||||
r.m_new = r.m_old;
|
||||
return r;
|
||||
}
|
||||
|
||||
// Reset transaction (invalidates item references)
|
||||
void cancel() noexcept
|
||||
{
|
||||
m_count = 0;
|
||||
}
|
||||
|
||||
// Try to commit sudoku (don't call)
|
||||
// Declared here, defined out of line; returns whether the commit succeeded
bool commit() const noexcept;
|
||||
};
|
||||
|
||||
// Storage for the value returned by the multi_cas callable (primary template: non-void result)
template <typename T>
|
||||
struct multi_cas_result
|
||||
{
|
||||
static constexpr bool is_void = false;
|
||||
|
||||
// Result of the last invocation of the callable
T ret;
|
||||
};
|
||||
|
||||
// Specialization for callables returning void: nothing is stored
template <>
|
||||
struct multi_cas_result<void>
|
||||
{
|
||||
static constexpr bool is_void = true;
|
||||
};
|
||||
|
||||
// Runs a callable as a multi-CAS transaction: the whole work happens in the constructor.
// Context is invoked with a multi_cas_record&; its return value (if not void) is kept in
// the inherited multi_cas_result<...>::ret.
template <typename Context>
|
||||
class multi_cas final : Context, multi_cas_record, public multi_cas_result<std::invoke_result_t<Context, multi_cas_record&>>
|
||||
{
|
||||
using result = multi_cas_result<std::invoke_result_t<Context, multi_cas_record&>>;
|
||||
using record = multi_cas_record;
|
||||
|
||||
public:
|
||||
// Implicit deduction guide candidate constructor (for lambda)
|
||||
// Repeats { reset record, invoke callable, commit } until commit() returns true
multi_cas(Context&& f) noexcept
|
||||
: Context(std::forward<Context>(f))
|
||||
{
|
||||
while (true)
|
||||
{
|
||||
multi_cas_record& rec = *this;
|
||||
// Start every attempt with an empty item list
record::cancel();
|
||||
|
||||
if constexpr (result::is_void)
|
||||
{
|
||||
Context::operator()(rec);
|
||||
}
|
||||
else
|
||||
{
|
||||
result::ret = Context::operator()(rec);
|
||||
}
|
||||
|
||||
if (record::commit())
|
||||
{
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
@ -17,6 +17,13 @@
|
||||
|
||||
#include "util/asm.hpp"
|
||||
|
||||
// MSVC only: declare the _xgetbv intrinsic by hand instead of including an intrinsics header
#ifdef _MSC_VER
|
||||
extern "C"
|
||||
{
|
||||
u64 _xgetbv(u32);
|
||||
}
|
||||
#endif
|
||||
|
||||
inline std::array<u32, 4> utils::get_cpuid(u32 func, u32 subfunc)
|
||||
{
|
||||
int regs[4];
|
||||
@ -303,6 +310,19 @@ static constexpr ullong round_tsc(ullong val)
|
||||
return utils::rounded_div(val, 1'000'000) * 1'000'000;
|
||||
}
|
||||
|
||||
// MSVC only: declare the _mm_lfence intrinsic by hand instead of including an intrinsics header
#ifdef _MSC_VER
|
||||
extern "C" void _mm_lfence();
|
||||
#endif
|
||||
|
||||
// Issues an LFENCE through whichever intrinsic the compiler provides:
// _mm_lfence on MSVC, __builtin_ia32_lfence otherwise
static inline void lfence()
|
||||
{
|
||||
#ifdef _MSC_VER
|
||||
_mm_lfence();
|
||||
#else
|
||||
__builtin_ia32_lfence();
|
||||
#endif
|
||||
}
|
||||
|
||||
ullong utils::get_tsc_freq()
|
||||
{
|
||||
static const ullong cal_tsc = []() -> ullong
|
||||
@ -343,17 +363,17 @@ ullong utils::get_tsc_freq()
|
||||
{
|
||||
#ifdef _WIN32
|
||||
Sleep(1);
|
||||
error_data[i] = (_mm_lfence(), __rdtsc());
|
||||
error_data[i] = (lfence(), utils::get_tsc());
|
||||
LARGE_INTEGER ctr;
|
||||
QueryPerformanceCounter(&ctr);
|
||||
rdtsc_data[i] = (_mm_lfence(), __rdtsc());
|
||||
rdtsc_data[i] = (lfence(), utils::get_tsc());
|
||||
timer_data[i] = ctr.QuadPart;
|
||||
#else
|
||||
usleep(200);
|
||||
error_data[i] = (_mm_lfence(), __rdtsc());
|
||||
error_data[i] = (lfence(), utils::get_tsc());
|
||||
struct timespec ts;
|
||||
clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||
rdtsc_data[i] = (_mm_lfence(), __rdtsc());
|
||||
rdtsc_data[i] = (lfence(), utils::get_tsc());
|
||||
timer_data[i] = ts.tv_nsec + (ts.tv_sec - sec_base) * 1'000'000'000;
|
||||
#endif
|
||||
}
|
||||
|
@ -1,13 +1,5 @@
|
||||
#pragma once // No BOM and only basic ASCII in this header, or a neko will die
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
#include <immintrin.h>
|
||||
#include <emmintrin.h>
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstddef>
|
||||
#include <cstring>
|
||||
@ -278,10 +270,28 @@ public:
|
||||
};
|
||||
|
||||
#ifndef _MSC_VER
|
||||
|
||||
using u128 = __uint128_t;
|
||||
using s128 = __int128_t;
|
||||
|
||||
using __m128i = long long __attribute__((vector_size(16)));
|
||||
using __m128d = double __attribute__((vector_size(16)));
|
||||
using __m128 = float __attribute__((vector_size(16)));
|
||||
|
||||
#else
|
||||
|
||||
extern "C"
|
||||
{
|
||||
union __m128;
|
||||
union __m128i;
|
||||
struct __m128d;
|
||||
|
||||
uchar _addcarry_u64(uchar, u64, u64, u64*);
|
||||
uchar _subborrow_u64(uchar, u64, u64, u64*);
|
||||
u64 __shiftleft128(u64, u64, uchar);
|
||||
u64 __shiftright128(u64, u64, uchar);
|
||||
}
|
||||
|
||||
// Unsigned 128-bit integer implementation (TODO)
|
||||
struct alignas(16) u128
|
||||
{
|
||||
|
@ -1,7 +1,6 @@
|
||||
#pragma once // No BOM and only basic ASCII in this header, or a neko will die
|
||||
|
||||
#include "util/types.hpp"
|
||||
#include <cmath>
|
||||
|
||||
// 128-bit vector type
|
||||
union alignas(16) v128
|
||||
@ -12,17 +11,17 @@ union alignas(16) v128
|
||||
template <typename T, usz N, usz M>
|
||||
struct masked_array_t // array type accessed as (index ^ M)
|
||||
{
|
||||
char m_data[16];
|
||||
T m_data[N];
|
||||
|
||||
public:
|
||||
T& operator[](usz index)
|
||||
{
|
||||
return reinterpret_cast<T*>(m_data)[index ^ M];
|
||||
return m_data[index ^ M];
|
||||
}
|
||||
|
||||
const T& operator[](usz index) const
|
||||
{
|
||||
return reinterpret_cast<const T*>(m_data)[index ^ M];
|
||||
return m_data[index ^ M];
|
||||
}
|
||||
};
|
||||
|
||||
@ -56,88 +55,55 @@ union alignas(16) v128
|
||||
reversed_array_t<f32> fr;
|
||||
reversed_array_t<f64> dr;
|
||||
|
||||
u128 _u;
|
||||
//s128 _s;
|
||||
|
||||
#ifdef _MSC_VER
|
||||
template <typename T>
|
||||
struct opaque_wrapper
|
||||
{
|
||||
u128 m_data;
|
||||
|
||||
opaque_wrapper() = default;
|
||||
|
||||
opaque_wrapper(const T& value)
|
||||
: m_data(std::bit_cast<u128>(value))
|
||||
{
|
||||
}
|
||||
|
||||
opaque_wrapper& operator=(const T& value)
|
||||
{
|
||||
m_data = std::bit_cast<u128>(value);
|
||||
return *this;
|
||||
}
|
||||
|
||||
operator T() const
|
||||
{
|
||||
return std::bit_cast<T>(m_data);
|
||||
}
|
||||
};
|
||||
|
||||
opaque_wrapper<__m128> vf;
|
||||
opaque_wrapper<__m128i> vi;
|
||||
opaque_wrapper<__m128d> vd;
|
||||
#else
|
||||
__m128 vf;
|
||||
__m128i vi;
|
||||
__m128d vd;
|
||||
#endif
|
||||
|
||||
struct bit_array_128
|
||||
{
|
||||
char m_data[16];
|
||||
|
||||
public:
|
||||
class bit_element
|
||||
{
|
||||
u64& data;
|
||||
const u64 mask;
|
||||
|
||||
public:
|
||||
bit_element(u64& data, const u64 mask)
|
||||
: data(data)
|
||||
, mask(mask)
|
||||
{
|
||||
}
|
||||
|
||||
operator bool() const
|
||||
{
|
||||
return (data & mask) != 0;
|
||||
}
|
||||
|
||||
bit_element& operator=(const bool right)
|
||||
{
|
||||
if (right)
|
||||
{
|
||||
data |= mask;
|
||||
}
|
||||
else
|
||||
{
|
||||
data &= ~mask;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
bit_element& operator=(const bit_element& right)
|
||||
{
|
||||
if (right)
|
||||
{
|
||||
data |= mask;
|
||||
}
|
||||
else
|
||||
{
|
||||
data &= ~mask;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
class bit_element;
|
||||
|
||||
// Index 0 returns the MSB and index 127 returns the LSB
|
||||
bit_element operator[](u32 index)
|
||||
{
|
||||
const auto data_ptr = reinterpret_cast<u64*>(m_data);
|
||||
|
||||
if constexpr (std::endian::little == std::endian::native)
|
||||
{
|
||||
return bit_element(data_ptr[1 - (index >> 6)], 0x8000000000000000ull >> (index & 0x3F));
|
||||
}
|
||||
else
|
||||
{
|
||||
return bit_element(data_ptr[index >> 6], 0x8000000000000000ull >> (index & 0x3F));
|
||||
}
|
||||
}
|
||||
[[deprecated]] bit_element operator[](u32 index);
|
||||
|
||||
// Index 0 returns the MSB and index 127 returns the LSB
|
||||
bool operator[](u32 index) const
|
||||
{
|
||||
const auto data_ptr = reinterpret_cast<const u64*>(m_data);
|
||||
|
||||
if constexpr (std::endian::little == std::endian::native)
|
||||
{
|
||||
return (data_ptr[1 - (index >> 6)] & (0x8000000000000000ull >> (index & 0x3F))) != 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
return (data_ptr[index >> 6] & (0x8000000000000000ull >> (index & 0x3F))) != 0;
|
||||
}
|
||||
}
|
||||
[[deprecated]] bool operator[](u32 index) const;
|
||||
} _bit;
|
||||
|
||||
static v128 from64(u64 _0, u64 _1 = 0)
|
||||
@ -171,51 +137,39 @@ union alignas(16) v128
|
||||
static v128 from32p(u32 value)
|
||||
{
|
||||
v128 ret;
|
||||
ret.vi = _mm_set1_epi32(static_cast<s32>(value));
|
||||
ret._u32[0] = value;
|
||||
ret._u32[1] = value;
|
||||
ret._u32[2] = value;
|
||||
ret._u32[3] = value;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static v128 from16p(u16 value)
|
||||
{
|
||||
v128 ret;
|
||||
ret.vi = _mm_set1_epi16(static_cast<s16>(value));
|
||||
ret._u16[0] = value;
|
||||
ret._u16[1] = value;
|
||||
ret._u16[2] = value;
|
||||
ret._u16[3] = value;
|
||||
ret._u16[4] = value;
|
||||
ret._u16[5] = value;
|
||||
ret._u16[6] = value;
|
||||
ret._u16[7] = value;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static v128 from8p(u8 value)
|
||||
{
|
||||
v128 ret;
|
||||
ret.vi = _mm_set1_epi8(static_cast<s8>(value));
|
||||
std::memset(&ret, value, sizeof(ret));
|
||||
return ret;
|
||||
}
|
||||
|
||||
static v128 fromBit(u32 bit)
|
||||
{
|
||||
v128 ret = {};
|
||||
ret._bit[bit] = true;
|
||||
return ret;
|
||||
}
|
||||
static inline v128 fromV(const __m128i& value);
|
||||
|
||||
static v128 fromV(__m128i value)
|
||||
{
|
||||
v128 ret;
|
||||
ret.vi = value;
|
||||
return ret;
|
||||
}
|
||||
static inline v128 fromF(const __m128& value);
|
||||
|
||||
static v128 fromF(__m128 value)
|
||||
{
|
||||
v128 ret;
|
||||
ret.vf = value;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static v128 fromD(__m128d value)
|
||||
{
|
||||
v128 ret;
|
||||
ret.vd = value;
|
||||
return ret;
|
||||
}
|
||||
static inline v128 fromD(const __m128d& value);
|
||||
|
||||
// Unaligned load with optional index offset
|
||||
static v128 loadu(const void* ptr, usz index = 0)
|
||||
@ -231,136 +185,46 @@ union alignas(16) v128
|
||||
std::memcpy(static_cast<u8*>(ptr) + index * sizeof(v128), &value, sizeof(v128));
|
||||
}
|
||||
|
||||
static inline v128 add8(const v128& left, const v128& right)
|
||||
{
|
||||
return fromV(_mm_add_epi8(left.vi, right.vi));
|
||||
}
|
||||
static inline v128 add8(const v128& left, const v128& right);
|
||||
|
||||
static inline v128 add16(const v128& left, const v128& right)
|
||||
{
|
||||
return fromV(_mm_add_epi16(left.vi, right.vi));
|
||||
}
|
||||
static inline v128 add16(const v128& left, const v128& right);
|
||||
|
||||
static inline v128 add32(const v128& left, const v128& right)
|
||||
{
|
||||
return fromV(_mm_add_epi32(left.vi, right.vi));
|
||||
}
|
||||
static inline v128 add32(const v128& left, const v128& right);
|
||||
|
||||
static inline v128 addfs(const v128& left, const v128& right)
|
||||
{
|
||||
return fromF(_mm_add_ps(left.vf, right.vf));
|
||||
}
|
||||
static inline v128 addfs(const v128& left, const v128& right);
|
||||
|
||||
static inline v128 addfd(const v128& left, const v128& right)
|
||||
{
|
||||
return fromD(_mm_add_pd(left.vd, right.vd));
|
||||
}
|
||||
static inline v128 addfd(const v128& left, const v128& right);
|
||||
|
||||
static inline v128 sub8(const v128& left, const v128& right)
|
||||
{
|
||||
return fromV(_mm_sub_epi8(left.vi, right.vi));
|
||||
}
|
||||
static inline v128 sub8(const v128& left, const v128& right);
|
||||
|
||||
static inline v128 sub16(const v128& left, const v128& right)
|
||||
{
|
||||
return fromV(_mm_sub_epi16(left.vi, right.vi));
|
||||
}
|
||||
static inline v128 sub16(const v128& left, const v128& right);
|
||||
|
||||
static inline v128 sub32(const v128& left, const v128& right)
|
||||
{
|
||||
return fromV(_mm_sub_epi32(left.vi, right.vi));
|
||||
}
|
||||
static inline v128 sub32(const v128& left, const v128& right);
|
||||
|
||||
static inline v128 subfs(const v128& left, const v128& right)
|
||||
{
|
||||
return fromF(_mm_sub_ps(left.vf, right.vf));
|
||||
}
|
||||
static inline v128 subfs(const v128& left, const v128& right);
|
||||
|
||||
static inline v128 subfd(const v128& left, const v128& right)
|
||||
{
|
||||
return fromD(_mm_sub_pd(left.vd, right.vd));
|
||||
}
|
||||
static inline v128 subfd(const v128& left, const v128& right);
|
||||
|
||||
static inline v128 maxu8(const v128& left, const v128& right)
|
||||
{
|
||||
return fromV(_mm_max_epu8(left.vi, right.vi));
|
||||
}
|
||||
static inline v128 maxu8(const v128& left, const v128& right);
|
||||
|
||||
static inline v128 minu8(const v128& left, const v128& right)
|
||||
{
|
||||
return fromV(_mm_min_epu8(left.vi, right.vi));
|
||||
}
|
||||
static inline v128 minu8(const v128& left, const v128& right);
|
||||
|
||||
static inline v128 eq8(const v128& left, const v128& right)
|
||||
{
|
||||
return fromV(_mm_cmpeq_epi8(left.vi, right.vi));
|
||||
}
|
||||
static inline v128 eq8(const v128& left, const v128& right);
|
||||
|
||||
static inline v128 eq16(const v128& left, const v128& right)
|
||||
{
|
||||
return fromV(_mm_cmpeq_epi16(left.vi, right.vi));
|
||||
}
|
||||
static inline v128 eq16(const v128& left, const v128& right);
|
||||
|
||||
static inline v128 eq32(const v128& left, const v128& right)
|
||||
{
|
||||
return fromV(_mm_cmpeq_epi32(left.vi, right.vi));
|
||||
}
|
||||
static inline v128 eq32(const v128& left, const v128& right);
|
||||
|
||||
static inline v128 eq32f(const v128& left, const v128& right)
|
||||
{
|
||||
return fromF(_mm_cmpeq_ps(left.vf, right.vf));
|
||||
}
|
||||
static inline v128 eq32f(const v128& left, const v128& right);
|
||||
|
||||
static inline v128 eq64f(const v128& left, const v128& right)
|
||||
{
|
||||
return fromD(_mm_cmpeq_pd(left.vd, right.vd));
|
||||
}
|
||||
static inline v128 fma32f(v128 a, const v128& b, const v128& c);
|
||||
|
||||
static inline bool use_fma = false;
|
||||
bool operator==(const v128& right) const;
|
||||
|
||||
// Fused multiply-add on four single-precision lanes: returns a * b + c.
// When the build does not define __FMA__, the hardware path is chosen at
// run time through use_fma; otherwise std::fmaf is applied per lane.
static inline v128 fma32f(v128 a, const v128& b, const v128& c)
{
#ifdef __FMA__
	// FMA is enabled for the whole build: the intrinsic can be used directly
	a.vf = _mm_fmadd_ps(a.vf, b.vf, c.vf);
	return a;
#else
	if (!use_fma) [[unlikely]]
	{
		// Software path: one fused operation per lane
		for (int lane = 0; lane < 4; lane++)
		{
			a._f[lane] = std::fmaf(a._f[lane], b._f[lane], c._f[lane]);
		}

		return a;
	}

#ifdef _MSC_VER
	a.vf = _mm_fmadd_ps(a.vf, b.vf, c.vf);
#else
	// Emit the instruction by hand since the build itself does not enable FMA
	__asm__("vfmadd213ps %[c], %[b], %[a]"
		: [a] "+x" (a.vf)
		: [b] "x" (b.vf)
		, [c] "x" (c.vf));
#endif
	return a;
#endif
}
|
||||
|
||||
bool operator==(const v128& right) const
|
||||
{
|
||||
return _mm_movemask_epi8(v128::eq32(*this, right).vi) == 0xffff;
|
||||
}
|
||||
|
||||
bool operator!=(const v128& right) const
|
||||
{
|
||||
return !operator==(right);
|
||||
}
|
||||
bool operator!=(const v128& right) const;
|
||||
|
||||
// result = (~left) & (right)
|
||||
static inline v128 andnot(const v128& left, const v128& right)
|
||||
{
|
||||
return fromV(_mm_andnot_si128(left.vi, right.vi));
|
||||
}
|
||||
static inline v128 andnot(const v128& left, const v128& right);
|
||||
|
||||
void clear()
|
||||
{
|
||||
@ -377,23 +241,3 @@ struct offset32_array<v128::masked_array_t<T, N, M>>
|
||||
return u32{sizeof(T)} * (static_cast<u32>(arg) ^ static_cast<u32>(M));
|
||||
}
|
||||
};
|
||||
|
||||
// Bitwise OR of two 128-bit vectors.
inline v128 operator|(const v128& left, const v128& right)
{
	const __m128i bits = _mm_or_si128(left.vi, right.vi);
	return v128::fromV(bits);
}
|
||||
|
||||
// Bitwise AND of two 128-bit vectors.
inline v128 operator&(const v128& left, const v128& right)
{
	const __m128i bits = _mm_and_si128(left.vi, right.vi);
	return v128::fromV(bits);
}
|
||||
|
||||
// Bitwise XOR of two 128-bit vectors.
inline v128 operator^(const v128& left, const v128& right)
{
	const __m128i bits = _mm_xor_si128(left.vi, right.vi);
	return v128::fromV(bits);
}
|
||||
|
||||
// Bitwise complement, implemented as XOR against a vector built from UINT32_MAX.
inline v128 operator~(const v128& other)
{
	const v128 ones = v128::from32p(UINT32_MAX);
	return other ^ ones;
}
|
||||
|
255
rpcs3/util/v128sse.hpp
Normal file
255
rpcs3/util/v128sse.hpp
Normal file
@ -0,0 +1,255 @@
|
||||
#pragma once
|
||||
|
||||
#include "util/types.hpp"
|
||||
#include "util/v128.hpp"
|
||||
#include "util/sysinfo.hpp"
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
|
||||
#include <immintrin.h>
|
||||
#include <emmintrin.h>
|
||||
|
||||
#include <cmath>
|
||||
|
||||
// Initialized from utils::has_fma3(); read by v128::fma32f to pick the hardware FMA path
// when the build itself was not compiled with __FMA__.
inline bool v128_use_fma = utils::has_fma3();
|
||||
|
||||
class v128::bit_array_128::bit_element
|
||||
{
|
||||
u64& data;
|
||||
const u64 mask;
|
||||
|
||||
public:
|
||||
bit_element(u64& data, const u64 mask)
|
||||
: data(data)
|
||||
, mask(mask)
|
||||
{
|
||||
}
|
||||
|
||||
operator bool() const
|
||||
{
|
||||
return (data & mask) != 0;
|
||||
}
|
||||
|
||||
bit_element& operator=(const bool right)
|
||||
{
|
||||
if (right)
|
||||
{
|
||||
data |= mask;
|
||||
}
|
||||
else
|
||||
{
|
||||
data &= ~mask;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
bit_element& operator=(const bit_element& right)
|
||||
{
|
||||
if (right)
|
||||
{
|
||||
data |= mask;
|
||||
}
|
||||
else
|
||||
{
|
||||
data &= ~mask;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
// Returns a writable proxy for bit `index`. Within the selected 64-bit word the bits are
// counted from the most significant end (index & 0x3F == 0 addresses the top bit).
[[deprecated]] inline v128::bit_array_128::bit_element v128::bit_array_128::operator[](u32 index)
{
	u64* const words = reinterpret_cast<u64*>(m_data);
	const u64 bit_mask = 0x8000000000000000ull >> (index & 0x3F);
	const u32 word = index >> 6;

	if constexpr (std::endian::little == std::endian::native)
	{
		// Little-endian host: word order is mirrored
		return bit_element(words[1 - word], bit_mask);
	}
	else
	{
		return bit_element(words[word], bit_mask);
	}
}
|
||||
|
||||
// Reads bit `index`. Within the selected 64-bit word the bits are counted from the most
// significant end (index & 0x3F == 0 addresses the top bit).
[[deprecated]] inline bool v128::bit_array_128::operator[](u32 index) const
{
	const u64* const words = reinterpret_cast<const u64*>(m_data);
	const u64 bit_mask = 0x8000000000000000ull >> (index & 0x3F);
	const u32 word = index >> 6;

	if constexpr (std::endian::little == std::endian::native)
	{
		// Little-endian host: word order is mirrored
		const u64 selected = words[1 - word] & bit_mask;
		return selected != 0;
	}
	else
	{
		const u64 selected = words[word] & bit_mask;
		return selected != 0;
	}
}
|
||||
|
||||
// Wraps a raw integer SSE register value in a v128.
inline v128 v128::fromV(const __m128i& value)
{
	v128 wrapped;
	wrapped.vi = value;
	return wrapped;
}
|
||||
|
||||
// Wraps a raw single-precision SSE register value in a v128.
inline v128 v128::fromF(const __m128& value)
{
	v128 wrapped;
	wrapped.vf = value;
	return wrapped;
}
|
||||
|
||||
// Wraps a raw double-precision SSE register value in a v128.
inline v128 v128::fromD(const __m128d& value)
{
	v128 wrapped;
	wrapped.vd = value;
	return wrapped;
}
|
||||
|
||||
// Lane-wise addition of 8-bit integer elements.
inline v128 v128::add8(const v128& left, const v128& right)
{
	const __m128i sum = _mm_add_epi8(left.vi, right.vi);
	return fromV(sum);
}
|
||||
|
||||
// Lane-wise addition of 16-bit integer elements.
inline v128 v128::add16(const v128& left, const v128& right)
{
	const __m128i sum = _mm_add_epi16(left.vi, right.vi);
	return fromV(sum);
}
|
||||
|
||||
// Lane-wise addition of 32-bit integer elements.
inline v128 v128::add32(const v128& left, const v128& right)
{
	const __m128i sum = _mm_add_epi32(left.vi, right.vi);
	return fromV(sum);
}
|
||||
|
||||
// Lane-wise addition of single-precision floating-point elements.
inline v128 v128::addfs(const v128& left, const v128& right)
{
	const __m128 sum = _mm_add_ps(left.vf, right.vf);
	return fromF(sum);
}
|
||||
|
||||
// Lane-wise addition of double-precision floating-point elements.
inline v128 v128::addfd(const v128& left, const v128& right)
{
	const __m128d sum = _mm_add_pd(left.vd, right.vd);
	return fromD(sum);
}
|
||||
|
||||
// Lane-wise subtraction (left - right) of 8-bit integer elements.
inline v128 v128::sub8(const v128& left, const v128& right)
{
	const __m128i diff = _mm_sub_epi8(left.vi, right.vi);
	return fromV(diff);
}
|
||||
|
||||
// Lane-wise subtraction (left - right) of 16-bit integer elements.
inline v128 v128::sub16(const v128& left, const v128& right)
{
	const __m128i diff = _mm_sub_epi16(left.vi, right.vi);
	return fromV(diff);
}
|
||||
|
||||
// Lane-wise subtraction (left - right) of 32-bit integer elements.
inline v128 v128::sub32(const v128& left, const v128& right)
{
	const __m128i diff = _mm_sub_epi32(left.vi, right.vi);
	return fromV(diff);
}
|
||||
|
||||
// Lane-wise subtraction (left - right) of single-precision floating-point elements.
inline v128 v128::subfs(const v128& left, const v128& right)
{
	const __m128 diff = _mm_sub_ps(left.vf, right.vf);
	return fromF(diff);
}
|
||||
|
||||
// Lane-wise subtraction (left - right) of double-precision floating-point elements.
inline v128 v128::subfd(const v128& left, const v128& right)
{
	const __m128d diff = _mm_sub_pd(left.vd, right.vd);
	return fromD(diff);
}
|
||||
|
||||
// Lane-wise unsigned maximum of the sixteen 8-bit elements of both operands.
inline v128 v128::maxu8(const v128& left, const v128& right)
{
	const __m128i larger = _mm_max_epu8(left.vi, right.vi);
	return fromV(larger);
}
|
||||
|
||||
// Lane-wise unsigned minimum of the sixteen 8-bit elements of both operands.
inline v128 v128::minu8(const v128& left, const v128& right)
{
	const __m128i smaller = _mm_min_epu8(left.vi, right.vi);
	return fromV(smaller);
}
|
||||
|
||||
// Compares 8-bit lanes for equality; a lane is all ones where equal, zero otherwise.
inline v128 v128::eq8(const v128& left, const v128& right)
{
	const __m128i lanes = _mm_cmpeq_epi8(left.vi, right.vi);
	return fromV(lanes);
}
|
||||
|
||||
// Compares 16-bit lanes for equality; a lane is all ones where equal, zero otherwise.
inline v128 v128::eq16(const v128& left, const v128& right)
{
	const __m128i lanes = _mm_cmpeq_epi16(left.vi, right.vi);
	return fromV(lanes);
}
|
||||
|
||||
// Compares 32-bit lanes for equality; a lane is all ones where equal, zero otherwise.
inline v128 v128::eq32(const v128& left, const v128& right)
{
	const __m128i lanes = _mm_cmpeq_epi32(left.vi, right.vi);
	return fromV(lanes);
}
|
||||
|
||||
// Compares single-precision lanes with _mm_cmpeq_ps and wraps the resulting lane mask.
inline v128 v128::eq32f(const v128& left, const v128& right)
{
	const __m128 lanes = _mm_cmpeq_ps(left.vf, right.vf);
	return fromF(lanes);
}
|
||||
|
||||
// Fused multiply-add on four single-precision lanes: returns a * b + c.
// When the build does not define __FMA__, the hardware path is chosen at
// run time through v128_use_fma; otherwise std::fmaf is applied per lane.
inline v128 v128::fma32f(v128 a, const v128& b, const v128& c)
{
#ifdef __FMA__
	// FMA is enabled for the whole build: the intrinsic can be used directly
	a.vf = _mm_fmadd_ps(a.vf, b.vf, c.vf);
	return a;
#else
	if (!v128_use_fma) [[unlikely]]
	{
		// Software path: one fused operation per lane
		for (int lane = 0; lane < 4; lane++)
		{
			a._f[lane] = std::fmaf(a._f[lane], b._f[lane], c._f[lane]);
		}

		return a;
	}

#ifdef _MSC_VER
	a.vf = _mm_fmadd_ps(a.vf, b.vf, c.vf);
#else
	// Emit the instruction by hand since the build itself does not enable FMA
	__asm__("vfmadd213ps %[c], %[b], %[a]"
		: [a] "+x" (a.vf)
		: [b] "x" (b.vf)
		, [c] "x" (c.vf));
#endif
	return a;
#endif
}
|
||||
|
||||
inline bool v128::operator==(const v128& right) const
|
||||
{
|
||||
return _mm_movemask_epi8(v128::eq32(*this, right).vi) == 0xffff;
|
||||
}
|
||||
|
||||
inline bool v128::operator!=(const v128& right) const
|
||||
{
|
||||
return !operator==(right);
|
||||
}
|
||||
|
||||
// result = (~left) & (right)
|
||||
inline v128 v128::andnot(const v128& left, const v128& right)
|
||||
{
|
||||
return fromV(_mm_andnot_si128(left.vi, right.vi));
|
||||
}
|
||||
|
||||
// Bitwise OR of two 128-bit vectors.
inline v128 operator|(const v128& left, const v128& right)
{
	const __m128i bits = _mm_or_si128(left.vi, right.vi);
	return v128::fromV(bits);
}
|
||||
|
||||
// Bitwise AND of two 128-bit vectors.
inline v128 operator&(const v128& left, const v128& right)
{
	const __m128i bits = _mm_and_si128(left.vi, right.vi);
	return v128::fromV(bits);
}
|
||||
|
||||
// Bitwise XOR of two 128-bit vectors.
inline v128 operator^(const v128& left, const v128& right)
{
	const __m128i bits = _mm_xor_si128(left.vi, right.vi);
	return v128::fromV(bits);
}
|
||||
|
||||
// Bitwise complement, implemented as XOR against a vector built from UINT32_MAX.
inline v128 operator~(const v128& other)
{
	const v128 ones = v128::from32p(UINT32_MAX);
	return other ^ ones;
}
|
Loading…
Reference in New Issue
Block a user