2013-11-19 11:30:58 +01:00
|
|
|
#pragma once
|
|
|
|
|
2016-02-01 22:55:43 +01:00
|
|
|
#include <cstdint>
|
|
|
|
#include <immintrin.h>
|
2014-10-07 15:35:44 +02:00
|
|
|
#include <emmintrin.h>
|
|
|
|
|
2016-02-01 22:55:43 +01:00
|
|
|
#define IS_LE_MACHINE 1
|
|
|
|
#define IS_BE_MACHINE 0
|
|
|
|
|
|
|
|
#ifdef _MSC_VER
|
|
|
|
#include <intrin.h>
|
|
|
|
#else
|
|
|
|
#include <x86intrin.h>
|
|
|
|
#endif
|
|
|
|
|
|
|
|
// Some platforms don't support thread_local well yet.
|
|
|
|
#ifndef _MSC_VER
|
2014-06-19 15:50:18 +02:00
|
|
|
#define thread_local __thread
|
2016-02-01 22:55:43 +01:00
|
|
|
#define __assume(cond) do { if (!(cond)) __builtin_unreachable(); } while (0)
|
2014-06-19 15:50:18 +02:00
|
|
|
#endif
|
|
|
|
|
2015-05-28 17:14:22 +02:00
|
|
|
#if defined(_MSC_VER)
|
2016-02-01 22:55:43 +01:00
|
|
|
#define safe_buffers __declspec(safebuffers)
|
2015-05-28 17:14:22 +02:00
|
|
|
#else
|
2016-02-01 22:55:43 +01:00
|
|
|
#define safe_buffers
|
2015-05-28 17:14:22 +02:00
|
|
|
#endif
|
|
|
|
|
|
|
|
#if defined(_MSC_VER)
|
2016-02-01 22:55:43 +01:00
|
|
|
#define never_inline __declspec(noinline)
|
2014-07-14 21:15:30 +02:00
|
|
|
#else
|
2016-02-01 22:55:43 +01:00
|
|
|
#define never_inline __attribute__((noinline))
|
2014-07-14 21:15:30 +02:00
|
|
|
#endif
|
|
|
|
|
2015-05-28 17:14:22 +02:00
|
|
|
#if defined(_MSC_VER)
|
|
|
|
#define force_inline __forceinline
|
2015-02-02 10:14:49 +01:00
|
|
|
#else
|
2016-03-07 02:09:42 +01:00
|
|
|
#define force_inline __attribute__((always_inline)) inline
|
2015-02-02 10:14:49 +01:00
|
|
|
#endif
|
|
|
|
|
2013-11-19 11:30:58 +01:00
|
|
|
#if defined(__GNUG__)
|
2015-05-27 05:11:59 +02:00
|
|
|
|
2014-04-10 00:54:32 +02:00
|
|
|
#include <stdlib.h>
|
2014-04-29 00:51:49 +02:00
|
|
|
|
2014-02-23 17:52:52 +01:00
|
|
|
#define _fpclass(x) std::fpclassify(x)
|
2013-11-19 11:30:58 +01:00
|
|
|
#define INFINITE 0xFFFFFFFF
|
2014-06-02 19:46:42 +02:00
|
|
|
|
2014-07-12 09:02:39 +02:00
|
|
|
#ifdef __APPLE__
|
2015-05-27 05:11:59 +02:00
|
|
|
|
2015-08-18 23:27:41 +02:00
|
|
|
// XXX only supports a single timer
|
|
|
|
#define TIMER_ABSTIME -1
|
|
|
|
/* The opengroup spec isn't clear on the mapping from REALTIME to CALENDAR
|
|
|
|
being appropriate or not.
|
|
|
|
http://pubs.opengroup.org/onlinepubs/009695299/basedefs/time.h.html */
|
|
|
|
#define CLOCK_REALTIME 1 // #define CALENDAR_CLOCK 1 from mach/clock_types.h
|
|
|
|
#define CLOCK_MONOTONIC 0 // #define SYSTEM_CLOCK 0
|
2014-04-29 00:51:49 +02:00
|
|
|
|
2015-08-18 23:27:41 +02:00
|
|
|
typedef int clockid_t;
|
|
|
|
|
|
|
|
/* the mach kernel uses struct mach_timespec, so struct timespec
|
|
|
|
is loaded from <sys/_types/_timespec.h> for compatability */
|
|
|
|
// struct timespec { time_t tv_sec; long tv_nsec; };
|
|
|
|
|
|
|
|
int clock_gettime(clockid_t clk_id, struct timespec *tp);
|
2014-04-29 00:51:49 +02:00
|
|
|
|
2014-07-12 09:02:39 +02:00
|
|
|
#endif /* __APPLE__ */
|
2015-08-06 22:20:48 +02:00
|
|
|
#endif /* __GNUG__ */
|
|
|
|
|
2016-02-01 22:55:43 +01:00
|
|
|
inline std::uint32_t cntlz32(std::uint32_t arg)
|
|
|
|
{
|
2015-08-06 22:20:48 +02:00
|
|
|
#if defined(_MSC_VER)
|
2016-02-01 22:55:43 +01:00
|
|
|
unsigned long res;
|
|
|
|
return _BitScanReverse(&res, arg) ? res ^ 31 : 32;
|
|
|
|
#else
|
|
|
|
return arg ? __builtin_clzll(arg) - 32 : 32;
|
|
|
|
#endif
|
|
|
|
}
|
2015-09-26 22:46:04 +02:00
|
|
|
|
2016-02-01 22:55:43 +01:00
|
|
|
inline std::uint64_t cntlz64(std::uint64_t arg)
|
2015-08-06 22:20:48 +02:00
|
|
|
{
|
2016-02-01 22:55:43 +01:00
|
|
|
#if defined(_MSC_VER)
|
|
|
|
unsigned long res;
|
|
|
|
return _BitScanReverse64(&res, arg) ? res ^ 63 : 64;
|
|
|
|
#else
|
|
|
|
return arg ? __builtin_clzll(arg) : 64;
|
|
|
|
#endif
|
|
|
|
}
|
2015-09-13 00:37:57 +02:00
|
|
|
|
2016-02-01 22:55:43 +01:00
|
|
|
template<typename T>
|
|
|
|
struct add_flags_result_t
|
|
|
|
{
|
|
|
|
T result;
|
|
|
|
bool carry;
|
|
|
|
//bool overflow;
|
|
|
|
bool zero;
|
|
|
|
bool sign;
|
2015-09-13 00:37:57 +02:00
|
|
|
|
2016-02-01 22:55:43 +01:00
|
|
|
add_flags_result_t() = default;
|
2015-09-13 00:37:57 +02:00
|
|
|
|
2016-02-01 22:55:43 +01:00
|
|
|
// Straighforward ADD with flags
|
|
|
|
add_flags_result_t(T a, T b)
|
|
|
|
: result(a + b)
|
|
|
|
, carry(result < a)
|
|
|
|
//, overflow((result ^ ~(a ^ b)) >> (sizeof(T) * 8 - 1) != 0)
|
|
|
|
, zero(result == 0)
|
|
|
|
, sign(result >> (sizeof(T) * 8 - 1) != 0)
|
2015-09-13 00:37:57 +02:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2016-02-01 22:55:43 +01:00
|
|
|
// Straighforward ADC with flags
|
|
|
|
add_flags_result_t(T a, T b, bool c)
|
|
|
|
: add_flags_result_t(a, b)
|
2015-09-13 00:37:57 +02:00
|
|
|
{
|
2016-02-01 22:55:43 +01:00
|
|
|
add_flags_result_t r(result, c);
|
|
|
|
result = r.result;
|
|
|
|
carry |= r.carry;
|
|
|
|
//overflow |= r.overflow;
|
|
|
|
zero = r.zero;
|
|
|
|
sign = r.sign;
|
2015-09-13 00:37:57 +02:00
|
|
|
}
|
2016-02-01 22:55:43 +01:00
|
|
|
};
|
2015-09-13 00:37:57 +02:00
|
|
|
|
2016-02-01 22:55:43 +01:00
|
|
|
inline add_flags_result_t<std::uint32_t> add32_flags(std::uint32_t a, std::uint32_t b)
|
|
|
|
{
|
|
|
|
//add_flags_result_t<std::uint32_t> r;
|
|
|
|
//r.carry = _addcarry_u32(0, a, b, &r.result) != 0;
|
|
|
|
//r.zero = r.result == 0;
|
|
|
|
//r.sign = r.result >> 31;
|
|
|
|
//return r;
|
2015-09-13 00:37:57 +02:00
|
|
|
|
2016-02-01 22:55:43 +01:00
|
|
|
return{ a, b };
|
|
|
|
}
|
2015-09-13 00:37:57 +02:00
|
|
|
|
2016-02-01 22:55:43 +01:00
|
|
|
inline add_flags_result_t<std::uint32_t> add32_flags(std::uint32_t a, std::uint32_t b, bool c)
|
|
|
|
{
|
|
|
|
return{ a, b, c };
|
|
|
|
}
|
2015-08-06 22:20:48 +02:00
|
|
|
|
2016-02-01 22:55:43 +01:00
|
|
|
inline add_flags_result_t<std::uint64_t> add64_flags(std::uint64_t a, std::uint64_t b)
|
2014-09-27 20:49:33 +02:00
|
|
|
{
|
2016-02-01 22:55:43 +01:00
|
|
|
return{ a, b };
|
2014-09-27 20:49:33 +02:00
|
|
|
}
|
|
|
|
|
2016-02-01 22:55:43 +01:00
|
|
|
inline add_flags_result_t<std::uint64_t> add64_flags(std::uint64_t a, std::uint64_t b, bool c)
|
2014-09-27 20:49:33 +02:00
|
|
|
{
|
2016-02-01 22:55:43 +01:00
|
|
|
return{ a, b, c };
|
2014-09-27 20:49:33 +02:00
|
|
|
}
|
2014-10-07 15:35:44 +02:00
|
|
|
|
2016-02-01 22:55:43 +01:00
|
|
|
// Compare 16 packed unsigned bytes (greater than)
|
2015-03-26 19:42:12 +01:00
|
|
|
inline __m128i sse_cmpgt_epu8(__m128i A, __m128i B)
|
2014-10-07 15:35:44 +02:00
|
|
|
{
|
|
|
|
// (A xor 0x80) > (B xor 0x80)
|
2015-03-26 19:42:12 +01:00
|
|
|
const auto sign = _mm_set1_epi32(0x80808080);
|
|
|
|
return _mm_cmpgt_epi8(_mm_xor_si128(A, sign), _mm_xor_si128(B, sign));
|
2014-10-07 15:35:44 +02:00
|
|
|
}
|
|
|
|
|
2015-03-26 19:42:12 +01:00
|
|
|
inline __m128i sse_cmpgt_epu16(__m128i A, __m128i B)
|
2014-10-07 15:35:44 +02:00
|
|
|
{
|
2015-03-26 19:42:12 +01:00
|
|
|
const auto sign = _mm_set1_epi32(0x80008000);
|
|
|
|
return _mm_cmpgt_epi16(_mm_xor_si128(A, sign), _mm_xor_si128(B, sign));
|
|
|
|
}
|
|
|
|
|
|
|
|
inline __m128i sse_cmpgt_epu32(__m128i A, __m128i B)
|
|
|
|
{
|
|
|
|
const auto sign = _mm_set1_epi32(0x80000000);
|
|
|
|
return _mm_cmpgt_epi32(_mm_xor_si128(A, sign), _mm_xor_si128(B, sign));
|
2014-10-07 15:35:44 +02:00
|
|
|
}
|
2015-03-29 13:00:10 +02:00
|
|
|
|
|
|
|
inline __m128 sse_exp2_ps(__m128 A)
|
|
|
|
{
|
|
|
|
const auto x0 = _mm_max_ps(_mm_min_ps(A, _mm_set1_ps(127.4999961f)), _mm_set1_ps(-127.4999961f));
|
|
|
|
const auto x1 = _mm_add_ps(x0, _mm_set1_ps(0.5f));
|
|
|
|
const auto x2 = _mm_sub_epi32(_mm_cvtps_epi32(x1), _mm_and_si128(_mm_castps_si128(_mm_cmpnlt_ps(_mm_setzero_ps(), x1)), _mm_set1_epi32(1)));
|
|
|
|
const auto x3 = _mm_sub_ps(x0, _mm_cvtepi32_ps(x2));
|
|
|
|
const auto x4 = _mm_mul_ps(x3, x3);
|
|
|
|
const auto x5 = _mm_mul_ps(x3, _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(x4, _mm_set1_ps(0.023093347705f)), _mm_set1_ps(20.20206567f)), x4), _mm_set1_ps(1513.906801f)));
|
|
|
|
const auto x6 = _mm_mul_ps(x5, _mm_rcp_ps(_mm_sub_ps(_mm_add_ps(_mm_mul_ps(_mm_set1_ps(233.1842117f), x4), _mm_set1_ps(4368.211667f)), x5)));
|
|
|
|
return _mm_mul_ps(_mm_add_ps(_mm_add_ps(x6, x6), _mm_set1_ps(1.0f)), _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(x2, _mm_set1_epi32(127)), 23)));
|
|
|
|
}
|
|
|
|
|
|
|
|
inline __m128 sse_log2_ps(__m128 A)
|
|
|
|
{
|
|
|
|
const auto _1 = _mm_set1_ps(1.0f);
|
|
|
|
const auto _c = _mm_set1_ps(1.442695040f);
|
|
|
|
const auto x0 = _mm_max_ps(A, _mm_castsi128_ps(_mm_set1_epi32(0x00800000)));
|
|
|
|
const auto x1 = _mm_or_ps(_mm_and_ps(x0, _mm_castsi128_ps(_mm_set1_epi32(0x807fffff))), _1);
|
|
|
|
const auto x2 = _mm_rcp_ps(_mm_add_ps(x1, _1));
|
|
|
|
const auto x3 = _mm_mul_ps(_mm_sub_ps(x1, _1), x2);
|
|
|
|
const auto x4 = _mm_add_ps(x3, x3);
|
|
|
|
const auto x5 = _mm_mul_ps(x4, x4);
|
|
|
|
const auto x6 = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_set1_ps(-0.7895802789f), x5), _mm_set1_ps(16.38666457f)), x5), _mm_set1_ps(-64.1409953f));
|
|
|
|
const auto x7 = _mm_rcp_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_set1_ps(-35.67227983f), x5), _mm_set1_ps(312.0937664f)), x5), _mm_set1_ps(-769.6919436f)));
|
|
|
|
const auto x8 = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_castps_si128(x0), 23), _mm_set1_epi32(127)));
|
|
|
|
return _mm_add_ps(_mm_mul_ps(_mm_mul_ps(_mm_mul_ps(_mm_mul_ps(x5, x6), x7), x4), _c), _mm_add_ps(_mm_mul_ps(x4, _c), x8));
|
|
|
|
}
|
2016-02-01 22:55:43 +01:00
|
|
|
|
|
|
|
// Helper function, used by ""_u16, ""_u32, ""_u64
|
|
|
|
constexpr std::uint8_t to_u8(char c)
|
|
|
|
{
|
|
|
|
return static_cast<std::uint8_t>(c);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Convert 2-byte string to u16 value like reinterpret_cast does
|
|
|
|
constexpr std::uint16_t operator""_u16(const char* s, std::size_t length)
|
|
|
|
{
|
|
|
|
return length != 2 ? throw s :
|
|
|
|
#if IS_LE_MACHINE == 1
|
|
|
|
to_u8(s[1]) << 8 | to_u8(s[0]);
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
// Convert 4-byte string to u32 value like reinterpret_cast does
|
|
|
|
constexpr std::uint32_t operator""_u32(const char* s, std::size_t length)
|
|
|
|
{
|
|
|
|
return length != 4 ? throw s :
|
|
|
|
#if IS_LE_MACHINE == 1
|
|
|
|
to_u8(s[3]) << 24 | to_u8(s[2]) << 16 | to_u8(s[1]) << 8 | to_u8(s[0]);
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
// Convert 8-byte string to u64 value like reinterpret_cast does
|
|
|
|
constexpr std::uint64_t operator""_u64(const char* s, std::size_t length)
|
|
|
|
{
|
|
|
|
return length != 8 ? throw s :
|
|
|
|
#if IS_LE_MACHINE == 1
|
|
|
|
static_cast<std::uint64_t>(to_u8(s[7]) << 24 | to_u8(s[6]) << 16 | to_u8(s[5]) << 8 | to_u8(s[4])) << 32 | to_u8(s[3]) << 24 | to_u8(s[2]) << 16 | to_u8(s[1]) << 8 | to_u8(s[0]);
|
|
|
|
#endif
|
|
|
|
}
|