1
0
mirror of https://github.com/RPCS3/rpcs3.git synced 2024-11-22 18:53:28 +01:00

SPU LLVM: approximate xfloat option

Adapt previous SPU ASMJIT changes made by @kd-11
FM, FMA, FNMS, FMS are approximated.
FCGT, FCMGT are accurate.
This commit is contained in:
Nekotekina 2018-12-17 02:05:26 +03:00
parent f750b4c420
commit d7be0a96f3
2 changed files with 79 additions and 2 deletions

View File

@ -4946,17 +4946,58 @@ public:
// FCGT: per-lane ordered greater-than compare of ra and rb.
// The comparison result is sign-extended into u32 lanes and stored in rt.
void FCGT(spu_opcode_t op) //
{
	// Accurate xfloat: compare the f64 views of the registers directly.
	if (g_cfg.core.spu_accurate_xfloat)
	{
		set_vr(op.rt, sext<u32[4]>(fcmp<llvm::FCmpInst::FCMP_OGT>(get_vr<f64[4]>(op.ra), get_vr<f64[4]>(op.rb))));
		return;
	}

	const auto a = get_vr<f32[4]>(op.ra);
	const auto b = get_vr<f32[4]>(op.rb);

	// See FCMGT.
	if (g_cfg.core.spu_approx_xfloat)
	{
		// Bit patterns of |a| and |b| viewed as signed integers.
		const auto ia = bitcast<s32[4]>(fabs(a));
		const auto ib = bitcast<s32[4]>(fabs(b));

		// True when at least one operand is above the zero/denormal range
		// (abs bits greater than the mantissa mask 0x7fffff).
		const auto nz = eval((ia > 0x7fffff) | (ib > 0x7fffff));

		// Use sign bits to invert abs values before comparison.
		const auto ca = eval(ia ^ (bitcast<s32[4]>(a) >> 31));
		const auto cb = eval(ib ^ (bitcast<s32[4]>(b) >> 31));
		set_vr(op.rt, sext<u32[4]>((ca > cb) & nz));
	}
	else
	{
		// Plain f32 compare. This is the only statement of the else branch, so it
		// cannot overwrite the result produced by the approximate path above.
		set_vr(op.rt, sext<u32[4]>(fcmp<llvm::FCmpInst::FCMP_OGT>(a, b)));
	}
}
// FCMGT: per-lane ordered greater-than compare of the magnitudes |ra| and |rb|.
// The comparison result is sign-extended into u32 lanes and stored in rt.
void FCMGT(spu_opcode_t op) //
{
	// Accurate xfloat: compare the magnitudes of the f64 views of the registers.
	if (g_cfg.core.spu_accurate_xfloat)
	{
		set_vr(op.rt, sext<u32[4]>(fcmp<llvm::FCmpInst::FCMP_OGT>(fabs(get_vr<f64[4]>(op.ra)), fabs(get_vr<f64[4]>(op.rb)))));
		return;
	}

	const auto a = get_vr<f32[4]>(op.ra);
	const auto b = get_vr<f32[4]>(op.rb);
	const auto abs_a = fabs(a);
	const auto abs_b = fabs(b);

	// Actually, it's accurate and can be used as an alternative path for accurate xfloat.
	if (g_cfg.core.spu_approx_xfloat)
	{
		// Compare abs values as integers, but return false if both are denormals or zeros.
		const auto ia = bitcast<s32[4]>(abs_a);
		const auto ib = bitcast<s32[4]>(abs_b);

		// True when at least one operand has abs bits above the mantissa mask 0x7fffff.
		const auto nz = eval((ia > 0x7fffff) | (ib > 0x7fffff));
		set_vr(op.rt, sext<u32[4]>((ia > ib) & nz));
	}
	else
	{
		// Plain f32 compare of magnitudes. This is the only statement of the else
		// branch, so it cannot overwrite the result of the approximate path above.
		set_vr(op.rt, sext<u32[4]>(fcmp<llvm::FCmpInst::FCMP_OGT>(abs_a, abs_b)));
	}
}
void FA(spu_opcode_t op) //
@ -4979,6 +5020,26 @@ public:
{
if (g_cfg.core.spu_accurate_xfloat)
set_vr(op.rt, get_vr<f64[4]>(op.ra) * get_vr<f64[4]>(op.rb));
else if (g_cfg.core.spu_approx_xfloat)
{
const auto a = get_vr<f32[4]>(op.ra);
const auto b = get_vr<f32[4]>(op.rb);
const auto m = eval(a * b);
const auto abs_a = bitcast<s32[4]>(fabs(a));
const auto abs_b = bitcast<s32[4]>(fabs(b));
const auto abs_m = bitcast<s32[4]>(fabs(m));
const auto sign_a = eval(bitcast<s32[4]>(a) & 0x80000000);
const auto sign_b = eval(bitcast<s32[4]>(b) & 0x80000000);
const auto smod_m = eval(bitcast<s32[4]>(m) & 0x7fffffff);
const auto fmax_m = eval((sign_a ^ sign_b) | 0x7fffffff);
const auto nzero = eval((abs_a > 0x7fffff) & (abs_b > 0x7fffff) & (abs_m > 0x7fffff));
// If m produces Inf or NaN, flush it to max xfloat with appropriate sign
const auto clamp = select(smod_m > 0x7f7fffff, bitcast<f32[4]>(fmax_m), m);
// If a, b, or a * b is a denorm or zero, return zero
set_vr(op.rt, select(nzero, clamp, fsplat<f32[4]>(0.)));
}
else
set_vr(op.rt, get_vr<f32[4]>(op.ra) * get_vr<f32[4]>(op.rb));
}
@ -5040,11 +5101,22 @@ public:
set_vr(op.rt, sext<u32[4]>(fcmp<llvm::FCmpInst::FCMP_OEQ>(fabs(get_vr<f32[4]>(op.ra)), fabs(get_vr<f32[4]>(op.rb)))));
}
// Returns a * b per lane, except that lanes where either operand's magnitude bits
// exceed 0x7f7fffff (the largest finite float in the normal range) yield zero.
value_t<f32[4]> mzero_if_xtended(value_t<f32[4]> a, value_t<f32[4]> b)
{
	// Reinterpret the magnitudes as signed integers so they can be ordered as integers.
	const auto abs_bits_a = bitcast<s32[4]>(fabs(a));
	const auto abs_bits_b = bitcast<s32[4]>(fabs(b));

	// A lane is "extended" when the larger of the two magnitudes is above the limit.
	const auto is_extended = eval(max(abs_bits_a, abs_bits_b) > 0x7f7fffff);
	const auto product = eval(a * b);

	return select(is_extended, fsplat<f32[4]>(0.), product);
}
// FNMS: stores rc - ra * rb into rt4, choosing the code path from the xfloat settings.
void FNMS(spu_opcode_t op) //
{
	// See FMA.
	if (g_cfg.core.spu_accurate_xfloat)
	{
		// Negation of fmuladd(ra, rb, -rc) on the f64 views.
		set_vr(op.rt4, -fmuladd(get_vr<f64[4]>(op.ra), get_vr<f64[4]>(op.rb), eval(-get_vr<f64[4]>(op.rc))));
		return;
	}

	if (g_cfg.core.spu_approx_xfloat)
	{
		// The product goes through mzero_if_xtended before being subtracted from rc.
		set_vr(op.rt4, get_vr<f32[4]>(op.rc) - mzero_if_xtended(get_vr<f32[4]>(op.ra), get_vr<f32[4]>(op.rb)));
		return;
	}

	// Default: plain f32 multiply and subtract.
	set_vr(op.rt4, get_vr<f32[4]>(op.rc) - get_vr<f32[4]>(op.ra) * get_vr<f32[4]>(op.rb));
}
@ -5054,6 +5126,8 @@ public:
// Hardware FMA produces the same result as multiply + add on the limited double range (xfloat).
if (g_cfg.core.spu_accurate_xfloat)
set_vr(op.rt4, fmuladd(get_vr<f64[4]>(op.ra), get_vr<f64[4]>(op.rb), get_vr<f64[4]>(op.rc)));
else if (g_cfg.core.spu_approx_xfloat)
set_vr(op.rt4, mzero_if_xtended(get_vr<f32[4]>(op.ra), get_vr<f32[4]>(op.rb)) + get_vr<f32[4]>(op.rc));
else
set_vr(op.rt4, get_vr<f32[4]>(op.ra) * get_vr<f32[4]>(op.rb) + get_vr<f32[4]>(op.rc));
}
@ -5063,6 +5137,8 @@ public:
// See FMA.
if (g_cfg.core.spu_accurate_xfloat)
set_vr(op.rt4, fmuladd(get_vr<f64[4]>(op.ra), get_vr<f64[4]>(op.rb), eval(-get_vr<f64[4]>(op.rc))));
else if (g_cfg.core.spu_approx_xfloat)
set_vr(op.rt4, mzero_if_xtended(get_vr<f32[4]>(op.ra), get_vr<f32[4]>(op.rb)) - get_vr<f32[4]>(op.rc));
else
set_vr(op.rt4, get_vr<f32[4]>(op.ra) * get_vr<f32[4]>(op.rb) - get_vr<f32[4]>(op.rc));
}

View File

@ -378,6 +378,7 @@ struct cfg_root : cfg::node
cfg::_bool spu_cache{this, "SPU Cache", true};
cfg::_enum<tsx_usage> enable_TSX{this, "Enable TSX", tsx_usage::enabled}; // Enable TSX. Forcing this on Haswell/Broadwell CPUs should be used carefully
cfg::_bool spu_accurate_xfloat{this, "Accurate xfloat", false};
cfg::_bool spu_approx_xfloat{this, "Approximate xfloat", true};
cfg::_enum<lib_loading_type> lib_loading{this, "Lib Loader", lib_loading_type::liblv2only};
cfg::_bool hook_functions{this, "Hook static functions"};