From aba332d4c48d83150bc27eefd4aabfeb886f81ea Mon Sep 17 00:00:00 2001 From: Nekotekina Date: Tue, 7 Sep 2021 18:35:00 +0300 Subject: [PATCH] SPU LLVM: make intrinsics for most xfloat instructions --- rpcs3/Emu/Cell/SPURecompiler.cpp | 673 ++++++++++++++++++++----------- 1 file changed, 440 insertions(+), 233 deletions(-) diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp index 0721528eae..79410b3fa4 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.cpp +++ b/rpcs3/Emu/Cell/SPURecompiler.cpp @@ -4149,6 +4149,11 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator // Set register value if (m_block) { +#ifndef _WIN32 + if (g_cfg.core.spu_debug) + value->setName(fmt::format("result_0x%05x", m_pos)); +#endif + m_block->reg.at(index) = saved_value; } @@ -6382,12 +6387,6 @@ public: return (std::forward(a) << 16 >> 16) * (std::forward(b) << 16 >> 16); } - template - auto fm(TA&& a, TB&& b) - { - return (std::forward(a)) * (std::forward(b)); - } - void SF(spu_opcode_t op) { set_vr(op.rt, get_vr(op.rb) - get_vr(op.ra)); @@ -7779,7 +7778,7 @@ public: bool is_input_positive(value_t a) { - if (auto [ok, v0, v1] = match_expr(a, fm(match(), match())); ok) + if (auto [ok, v0, v1] = match_expr(a, match() * match()); ok) { if (v0.value == v1.value) { @@ -7838,6 +7837,12 @@ public: return true; } + template + static llvm_calli frest(T&& a) + { + return {"spu_frest", {std::forward(a)}}; + } + void FREST(spu_opcode_t op) { // TODO @@ -7847,20 +7852,46 @@ public: const auto mask_ov = sext(bitcast(fabs(a)) > splat(0x7e7fffff)); const auto mask_de = eval(noncast(sext(fcmp_ord(a == fsplat(0.)))) >> 1); set_vr(op.rt, (bitcast(fre(a)) & ~mask_ov) | noncast(mask_de)); + return; } - else + + register_intrinsic("spu_frest", [&](llvm::CallInst* ci) { - set_vr(op.rt, fre(get_vr(op.ra))); - } + const auto a = value(ci->getOperand(0)); + return fre(a); + }); + + set_vr(op.rt, frest(get_vr(op.ra))); + } + + template + static llvm_calli frsqest(T&& a) + { + return {"spu_frsqest", {std::forward(a)}}; } void FRSQEST(spu_opcode_t op) { // TODO if (g_cfg.core.spu_accurate_xfloat) + { set_vr(op.rt, fsplat(1.0) / fsqrt(fabs(get_vr(op.ra)))); - else - set_vr(op.rt, frsqe(fabs(get_vr(op.ra)))); + return; + } + + register_intrinsic("spu_frsqest", [&](llvm::CallInst* ci) + { + const auto a = value(ci->getOperand(0)); + return frsqe(fabs(a)); + }); + + set_vr(op.rt, frsqest(get_vr(op.ra))); + } + + template + static llvm_calli fcgt(T&& a, U&& b) + { + return {"spu_fcgt", {std::forward(a), std::forward(b)}}; } void FCGT(spu_opcode_t op) @@ -7871,58 +7902,74 @@ public: return; } - const auto [a, b] = get_vrs(op.ra, op.rb); - const value_t ab[2]{a, b}; - - std::bitset<2> safe_int_compare(0); - std::bitset<2> safe_nonzero_compare(0); - - for (u32 i = 0; i < 2; i++) + register_intrinsic("spu_fcgt", [&](llvm::CallInst* ci) { - if (auto [ok, data] = get_const_vector(ab[i].value, m_pos); ok) + const auto a = value(ci->getOperand(0)); + const auto b = value(ci->getOperand(1)); + + const value_t ab[2]{a, b}; + + std::bitset<2> safe_int_compare(0); + std::bitset<2> safe_nonzero_compare(0); + + for (u32 i = 0; i < 2; i++) { - safe_int_compare.set(i); - safe_nonzero_compare.set(i); - - for (u32 j = 0; j < 4; j++) + if (auto [ok, data] = get_const_vector(ab[i].value, m_pos, __LINE__ + i); ok) { - const u32 value = data._u32[j]; - const u8 exponent = static_cast(value >> 23); + safe_int_compare.set(i); + safe_nonzero_compare.set(i); - if (value >= 0x7f7fffffu || !exponent) + for (u32 j = 0; j < 4; j++) { - // Postive or negative zero, Denormal (treated as zero), Negative constant, or Normalized number with exponent +127 - // Cannot used signed integer compare safely - // Note: Technically this optimization is accurate for any positive value, but due to the fact that - // we don't produce "extended range" values the same way as real hardware, it's not safe to apply - // this optimization for values outside of the range of x86 floating point hardware. - safe_int_compare.reset(i); - if (!exponent) safe_nonzero_compare.reset(i); + const u32 value = data._u32[j]; + const u8 exponent = static_cast(value >> 23); + + if (value >= 0x7f7fffffu || !exponent) + { + // Postive or negative zero, Denormal (treated as zero), Negative constant, or Normalized number with exponent +127 + // Cannot used signed integer compare safely + // Note: Technically this optimization is accurate for any positive value, but due to the fact that + // we don't produce "extended range" values the same way as real hardware, it's not safe to apply + // this optimization for values outside of the range of x86 floating point hardware. + safe_int_compare.reset(i); + if (!exponent) safe_nonzero_compare.reset(i); + } } } } - } - if (safe_int_compare.any()) - { - set_vr(op.rt, sext(bitcast(a) > bitcast(b))); - return; - } + if (safe_int_compare.any()) + { + return eval(sext(bitcast(a) > bitcast(b))); + } - if (g_cfg.core.spu_approx_xfloat) - { - const auto ai = eval(bitcast(a)); - const auto bi = eval(bitcast(b)); + if (g_cfg.core.spu_approx_xfloat) + { + const auto ai = eval(bitcast(a)); + const auto bi = eval(bitcast(b)); - if (!safe_nonzero_compare.any()) - set_vr(op.rt, sext(fcmp_uno(a != b) & select((ai & bi) >= 0, ai > bi, ai < bi))); + if (!safe_nonzero_compare.any()) + { + return eval(sext(fcmp_uno(a != b) & select((ai & bi) >= 0, ai > bi, ai < bi))); + } + else + { + return eval(sext(select((ai & bi) >= 0, ai > bi, ai < bi))); + } + } else - set_vr(op.rt, sext(select((ai & bi) >= 0, ai > bi, ai < bi))); - } - else - { - set_vr(op.rt, sext(fcmp_ord(a > b))); - } + { + return eval(sext(fcmp_ord(a > b))); + } + }); + + set_vr(op.rt, fcgt(get_vr(op.ra), get_vr(op.rb))); + } + + template + static llvm_calli fcmgt(T&& a, U&& b) + { + return {"spu_fcmgt", {std::forward(a), std::forward(b)}}; } void FCMGT(spu_opcode_t op) @@ -7933,83 +7980,158 @@ public: return; } - const auto a = eval(fabs(get_vr(op.ra))); - const auto b = eval(fabs(get_vr(op.rb))); + register_intrinsic("spu_fcmgt", [&](llvm::CallInst* ci) + { + const auto a = value(ci->getOperand(0)); + const auto b = value(ci->getOperand(1)); - if (g_cfg.core.spu_approx_xfloat) - { - set_vr(op.rt, sext(fcmp_uno(a > b) & (bitcast(a) > bitcast(b)))); - } - else - { - set_vr(op.rt, sext(fcmp_ord(a > b))); - } + const auto ma = eval(fabs(a)); + const auto mb = eval(fabs(b)); + + if (g_cfg.core.spu_approx_xfloat) + { + return eval(sext(fcmp_uno(ma > mb) & (bitcast(ma) > bitcast(mb)))); + } + else + { + return eval(sext(fcmp_ord(ma > mb))); + } + }); + + set_vr(op.rt, fcmgt(get_vr(op.ra), get_vr(op.rb))); + } + + template + static llvm_calli fa(T&& a, U&& b) + { + return {"spu_fa", {std::forward(a), std::forward(b)}}; } void FA(spu_opcode_t op) { if (g_cfg.core.spu_accurate_xfloat) + { set_vr(op.rt, get_vr(op.ra) + get_vr(op.rb)); - else - set_vr(op.rt, get_vr(op.ra) + get_vr(op.rb)); + return; + } + + register_intrinsic("spu_fa", [&](llvm::CallInst* ci) + { + const auto a = value(ci->getOperand(0)); + const auto b = value(ci->getOperand(1)); + + return a + b; + }); + + set_vr(op.rt, fa(get_vr(op.ra), get_vr(op.rb))); + } + + template + static llvm_calli fs(T&& a, U&& b) + { + return {"spu_fs", {std::forward(a), std::forward(b)}}; } void FS(spu_opcode_t op) { if (g_cfg.core.spu_accurate_xfloat) - set_vr(op.rt, get_vr(op.ra) - get_vr(op.rb)); - else if (g_cfg.core.spu_approx_xfloat) { - const auto b = eval(clamp_smax(get_vr(op.rb))); // for #4478 - set_vr(op.rt, get_vr(op.ra) - b); + set_vr(op.rt, get_vr(op.ra) - get_vr(op.rb)); + return; } - else - set_vr(op.rt, get_vr(op.ra) - get_vr(op.rb)); + + register_intrinsic("spu_fs", [&](llvm::CallInst* ci) + { + const auto a = value(ci->getOperand(0)); + const auto b = value(ci->getOperand(1)); + + if (g_cfg.core.spu_approx_xfloat) + { + const auto bc = clamp_smax(b); // for #4478 + return eval(a - bc); + } + else + { + return eval(a - b); + } + }); + + set_vr(op.rt, fs(get_vr(op.ra), get_vr(op.rb))); + } + + template + static llvm_calli fm(T&& a, U&& b) + { + return {"spu_fm", {std::forward(a), std::forward(b)}}; } void FM(spu_opcode_t op) { if (g_cfg.core.spu_accurate_xfloat) - set_vr(op.rt, get_vr(op.ra) * get_vr(op.rb)); - else if (g_cfg.core.spu_approx_xfloat) { - const auto a = get_vr(op.ra); - const auto b = get_vr(op.rb); - - if (op.ra == op.rb && !m_interp_magn) - { - set_vr(op.rt, fm(a, b)); - return; - } - - const auto ma = eval(sext(fcmp_uno(a != fsplat(0.)))); - const auto mb = eval(sext(fcmp_uno(b != fsplat(0.)))); - const auto cx = eval(ma & mb); - const auto x = fm(a, b); - set_vr(op.rt, eval(bitcast(bitcast(x) & cx))); + set_vr(op.rt, get_vr(op.ra) * get_vr(op.rb)); + return; } - else - set_vr(op.rt, get_vr(op.ra) * get_vr(op.rb)); + + register_intrinsic("spu_fm", [&](llvm::CallInst* ci) + { + const auto a = value(ci->getOperand(0)); + const auto b = value(ci->getOperand(1)); + + if (g_cfg.core.spu_approx_xfloat) + { + if (op.ra == op.rb && !m_interp_magn) + { + return eval(a * b); + } + + const auto ma = sext(fcmp_uno(a != fsplat(0.))); + const auto mb = sext(fcmp_uno(b != fsplat(0.))); + return eval(bitcast(bitcast(a * b) & ma & mb)); + } + else + { + return eval(a * b); + } + }); + + set_vr(op.rt, fm(get_vr(op.ra), get_vr(op.rb))); + } + + template + static llvm_calli fesd(T&& a) + { + return {"spu_fesd", {std::forward(a)}}; } void FESD(spu_opcode_t op) { if (g_cfg.core.spu_accurate_xfloat) { - const auto r = shuffle2(get_vr(op.ra), fsplat(0.), 1, 3); + const auto r = zshuffle(get_vr(op.ra), 1, 3); const auto d = bitcast(r); const auto a = eval(d & 0x7fffffffffffffff); const auto s = eval(d & 0x8000000000000000); const auto i = select(a == 0x47f0000000000000, eval(s | 0x7ff0000000000000), d); const auto n = select(a > 0x47f0000000000000, splat(0x7ff8000000000000), i); set_vr(op.rt, bitcast(n)); + return; } - else + + register_intrinsic("spu_fesd", [&](llvm::CallInst* ci) { - value_t r; - r.value = m_ir->CreateFPExt(shuffle2(get_vr(op.ra), fsplat(0.), 1, 3).eval(m_ir), get_type()); - set_vr(op.rt, r); - } + const auto a = value(ci->getOperand(0)); + + return fpcast(zshuffle(a, 1, 3)); + }); + + set_vr(op.rt, fesd(get_vr(op.ra))); + } + + template + static llvm_calli frds(T&& a) + { + return {"spu_frds", {std::forward(a)}}; } void FRDS(spu_opcode_t op) @@ -8023,14 +8145,24 @@ public: const auto i = select(a > 0x47f0000000000000, eval(s | 0x47f0000000000000), d); const auto n = select(a > 0x7ff0000000000000, splat(0x47f8000000000000), i); const auto z = select(a < 0x3810000000000000, s, n); - set_vr(op.rt, shuffle2(bitcast(z), fsplat(0.), 2, 0, 3, 1), false); + set_vr(op.rt, zshuffle(bitcast(z), 2, 0, 3, 1), false); + return; } - else + + register_intrinsic("spu_frds", [&](llvm::CallInst* ci) { - value_t r; - r.value = m_ir->CreateFPTrunc(get_vr(op.ra).value, get_type()); - set_vr(op.rt, shuffle2(r, fsplat(0.), 2, 0, 3, 1)); - } + const auto a = value(ci->getOperand(0)); + + return zshuffle(fpcast(a), 2, 0, 3, 1); + }); + + set_vr(op.rt, frds(get_vr(op.ra))); + } + + template + static llvm_calli fceq(T&& a, U&& b) + { + return {"spu_fceq", {std::forward(a), std::forward(b)}}; } void FCEQ(spu_opcode_t op) @@ -8041,61 +8173,70 @@ public: return; } - const auto [a, b] = get_vrs(op.ra, op.rb); - const value_t ab[2]{a, b}; - - std::bitset<2> safe_float_compare(0); - std::bitset<2> safe_int_compare(0); - - for (u32 i = 0; i < 2; i++) + register_intrinsic("spu_fceq", [&](llvm::CallInst* ci) { - if (auto [ok, data] = get_const_vector(ab[i].value, m_pos); ok) + const auto a = value(ci->getOperand(0)); + const auto b = value(ci->getOperand(1)); + + const value_t ab[2]{a, b}; + + std::bitset<2> safe_float_compare(0); + std::bitset<2> safe_int_compare(0); + + for (u32 i = 0; i < 2; i++) { - safe_float_compare.set(i); - safe_int_compare.set(i); - - for (u32 j = 0; j < 4; j++) + if (auto [ok, data] = get_const_vector(ab[i].value, m_pos, __LINE__ + i); ok) { - const u32 value = data._u32[j]; - const u8 exponent = static_cast(value >> 23); + safe_float_compare.set(i); + safe_int_compare.set(i); - // unsafe if nan - if (exponent == 255) + for (u32 j = 0; j < 4; j++) { - safe_float_compare.reset(i); - } + const u32 value = data._u32[j]; + const u8 exponent = static_cast(value >> 23); - // unsafe if denormal or 0 - if (!exponent) - { - safe_int_compare.reset(i); + // unsafe if nan + if (exponent == 255) + { + safe_float_compare.reset(i); + } + + // unsafe if denormal or 0 + if (!exponent) + { + safe_int_compare.reset(i); + } } } } - } - if (safe_float_compare.any()) - { - set_vr(op.rt, sext(fcmp_ord(a == b))); - return; - } + if (safe_float_compare.any()) + { + return eval(sext(fcmp_ord(a == b))); + } - if (safe_int_compare.any()) - { - set_vr(op.rt, sext(bitcast(a) == bitcast(b))); - return; - } + if (safe_int_compare.any()) + { + return eval(sext(bitcast(a) == bitcast(b))); + } - if (g_cfg.core.spu_approx_xfloat) - { - const auto ai = eval(bitcast(a)); - const auto bi = eval(bitcast(b)); - set_vr(op.rt, sext(fcmp_ord(a == b)) | sext(ai == bi)); - } - else - { - set_vr(op.rt, sext(fcmp_ord(a == b))); - } + if (g_cfg.core.spu_approx_xfloat) + { + return eval(sext(fcmp_ord(a == b)) | sext(bitcast(a) == bitcast(b))); + } + else + { + return eval(sext(fcmp_ord(a == b))); + } + }); + + set_vr(op.rt, fceq(get_vr(op.ra), get_vr(op.rb))); + } + + template + static llvm_calli fcmeq(T&& a, U&& b) + { + return {"spu_fcmeq", {std::forward(a), std::forward(b)}}; } void FCMEQ(spu_opcode_t op) @@ -8106,84 +8247,83 @@ public: return; } - const auto [a, b] = get_vrs(op.ra, op.rb); - const value_t ab[2]{a, b}; - - std::bitset<2> safe_float_compare(0); - std::bitset<2> safe_int_compare(0); - - for (u32 i = 0; i < 2; i++) + register_intrinsic("spu_fcmeq", [&](llvm::CallInst* ci) { - if (auto [ok, data] = get_const_vector(ab[i].value, m_pos); ok) + const auto a = value(ci->getOperand(0)); + const auto b = value(ci->getOperand(1)); + + const value_t ab[2]{a, b}; + + std::bitset<2> safe_float_compare(0); + std::bitset<2> safe_int_compare(0); + + for (u32 i = 0; i < 2; i++) { - safe_float_compare.set(i); - safe_int_compare.set(i); - - for (u32 j = 0; j < 4; j++) + if (auto [ok, data] = get_const_vector(ab[i].value, m_pos, __LINE__ + i); ok) { - const u32 value = data._u32[j]; - const u8 exponent = static_cast(value >> 23); + safe_float_compare.set(i); + safe_int_compare.set(i); - // unsafe if nan - if (exponent == 255) + for (u32 j = 0; j < 4; j++) { - safe_float_compare.reset(i); - } + const u32 value = data._u32[j]; + const u8 exponent = static_cast(value >> 23); - // unsafe if denormal or 0 - if (!exponent) - { - safe_int_compare.reset(i); + // unsafe if nan + if (exponent == 255) + { + safe_float_compare.reset(i); + } + + // unsafe if denormal or 0 + if (!exponent) + { + safe_int_compare.reset(i); + } } } } - } - const auto fa = eval(fabs(a)); - const auto fb = eval(fabs(b)); + const auto fa = eval(fabs(a)); + const auto fb = eval(fabs(b)); - if (safe_float_compare.any()) - { - set_vr(op.rt, sext(fcmp_ord(fa == fb))); - return; - } + if (safe_float_compare.any()) + { + return eval(sext(fcmp_ord(fa == fb))); + } - if (safe_int_compare.any()) - { - set_vr(op.rt, sext(bitcast(fa) == bitcast(fb))); - return; - } + if (safe_int_compare.any()) + { + return eval(sext(bitcast(fa) == bitcast(fb))); + } - if (g_cfg.core.spu_approx_xfloat) - { - const auto ai = eval(bitcast(fa)); - const auto bi = eval(bitcast(fb)); - set_vr(op.rt, sext(fcmp_ord(fa == fb)) | sext(ai == bi)); - } - else - { - set_vr(op.rt, sext(fcmp_ord(fa == fb))); - } + if (g_cfg.core.spu_approx_xfloat) + { + return eval(sext(fcmp_ord(fa == fb)) | sext(bitcast(fa) == bitcast(fb))); + } + else + { + return eval(sext(fcmp_ord(fa == fb))); + } + }); + + set_vr(op.rt, fcmeq(get_vr(op.ra), get_vr(op.rb))); } value_t fma32x4(value_t a, value_t b, value_t c) { - value_t r; - // Optimization: Emit only a floating multiply if the addend is zero // This is odd since SPU code could just use the FM instruction, but it seems common enough if (auto [ok, data] = get_const_vector(c.value, m_pos); ok) { if (is_spu_float_zero(data, -1)) { - r = eval(a * b); - return r; + return eval(a * b); } if (!m_use_fma && is_spu_float_zero(data, +1)) { - r = eval(a * b + fsplat(0.f)); - return r; + return eval(a * b + fsplat(0.f)); } } @@ -8230,66 +8370,130 @@ public: if (m_use_fma) { - r.value = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::fma), {a.value, b.value, c.value}); - return r; + return eval(fmuladd(a, b, c, true)); } // Convert to doubles - const auto xa = m_ir->CreateFPExt(a.value, get_type()); - const auto xb = m_ir->CreateFPExt(b.value, get_type()); - const auto xc = m_ir->CreateFPExt(c.value, get_type()); - const auto xr = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::fmuladd), {xa, xb, xc}); - r.value = m_ir->CreateFPTrunc(xr, get_type()); - return r; + const auto xa = fpcast(a); + const auto xb = fpcast(b); + const auto xc = fpcast(c); + const auto xr = fmuladd(xa, xb, xc, false); + return eval(fpcast(xr)); + } + + template + static llvm_calli fnms(T&& a, U&& b, V&& c) + { + return {"spu_fnms", {std::forward(a), std::forward(b), std::forward(c)}}; } void FNMS(spu_opcode_t op) { // See FMA. if (g_cfg.core.spu_accurate_xfloat) - set_vr(op.rt4, fmuladd(eval(-get_vr(op.ra)), get_vr(op.rb), get_vr(op.rc))); - else if (g_cfg.core.spu_approx_xfloat) { - const auto a = eval(clamp_smax(get_vr(op.ra))); - const auto b = eval(clamp_smax(get_vr(op.rb))); - set_vr(op.rt4, fma32x4(eval(-(a)), (b), get_vr(op.rc))); + const auto [a, b, c] = get_vrs(op.ra, op.rb, op.rc); + set_vr(op.rt4, fmuladd(-a, b, c)); + return; } - else - set_vr(op.rt4, fma32x4(eval(-get_vr(op.ra)), get_vr(op.rb), get_vr(op.rc))); + + register_intrinsic("spu_fnms", [&](llvm::CallInst* ci) + { + const auto a = value(ci->getOperand(0)); + const auto b = value(ci->getOperand(1)); + const auto c = value(ci->getOperand(2)); + + if (g_cfg.core.spu_approx_xfloat) + { + return fma32x4(eval(-clamp_smax(a)), clamp_smax(b), c); + } + else + { + return fma32x4(eval(-a), b, c); + } + }); + + set_vr(op.rt4, fnms(get_vr(op.ra), get_vr(op.rb), get_vr(op.rc))); + } + + template + static llvm_calli fma(T&& a, U&& b, V&& c) + { + return {"spu_fma", {std::forward(a), std::forward(b), std::forward(c)}}; } void FMA(spu_opcode_t op) { // Hardware FMA produces the same result as multiple + add on the limited double range (xfloat). if (g_cfg.core.spu_accurate_xfloat) - set_vr(op.rt4, fmuladd(get_vr(op.ra), get_vr(op.rb), get_vr(op.rc))); - else if (g_cfg.core.spu_approx_xfloat) { - const auto a = get_vr(op.ra); - const auto b = get_vr(op.rb); - const auto ma = eval(sext(fcmp_uno(a != fsplat(0.)))); - const auto mb = eval(sext(fcmp_uno(b != fsplat(0.)))); - const auto ca = eval(bitcast(bitcast(a) & mb)); - const auto cb = eval(bitcast(bitcast(b) & ma)); - set_vr(op.rt4, fma32x4((ca), (cb), get_vr(op.rc))); + const auto [a, b, c] = get_vrs(op.ra, op.rb, op.rc); + set_vr(op.rt4, fmuladd(a, b, c)); + return; } - else - set_vr(op.rt4, fma32x4(get_vr(op.ra), get_vr(op.rb), get_vr(op.rc))); + + register_intrinsic("spu_fma", [&](llvm::CallInst* ci) + { + const auto a = value(ci->getOperand(0)); + const auto b = value(ci->getOperand(1)); + const auto c = value(ci->getOperand(2)); + + if (g_cfg.core.spu_approx_xfloat) + { + const auto ma = sext(fcmp_uno(a != fsplat(0.))); + const auto mb = sext(fcmp_uno(b != fsplat(0.))); + const auto ca = bitcast(bitcast(a) & mb); + const auto cb = bitcast(bitcast(b) & ma); + return fma32x4(eval(ca), eval(cb), c); + } + else + { + return fma32x4(a, b, c); + } + }); + + set_vr(op.rt4, fma(get_vr(op.ra), get_vr(op.rb), get_vr(op.rc))); + } + + template + static llvm_calli fms(T&& a, U&& b, V&& c) + { + return {"spu_fms", {std::forward(a), std::forward(b), std::forward(c)}}; } void FMS(spu_opcode_t op) { // See FMA. if (g_cfg.core.spu_accurate_xfloat) - set_vr(op.rt4, fmuladd(get_vr(op.ra), get_vr(op.rb), eval(-get_vr(op.rc)))); - else if (g_cfg.core.spu_approx_xfloat) { - const auto a = eval(clamp_smax(get_vr(op.ra))); - const auto b = eval(clamp_smax(get_vr(op.rb))); - set_vr(op.rt4, fma32x4((a), (b), eval(-get_vr(op.rc)))); + const auto [a, b, c] = get_vrs(op.ra, op.rb, op.rc); + set_vr(op.rt4, fmuladd(a, b, -c)); + return; } - else - set_vr(op.rt4, fma32x4(get_vr(op.ra), get_vr(op.rb), eval(-get_vr(op.rc)))); + + register_intrinsic("spu_fms", [&](llvm::CallInst* ci) + { + const auto a = value(ci->getOperand(0)); + const auto b = value(ci->getOperand(1)); + const auto c = value(ci->getOperand(2)); + + if (g_cfg.core.spu_approx_xfloat) + { + return fma32x4(clamp_smax(a), clamp_smax(b), eval(-c)); + } + else + { + return fma32x4(a, b, eval(-c)); + } + }); + + set_vr(op.rt4, fms(get_vr(op.ra), get_vr(op.rb), get_vr(op.rc))); + } + + template + static llvm_calli fi(T&& a, U&& b) + { + return {"spu_fi", {std::forward(a), std::forward(b)}}; } void FI(spu_opcode_t op) @@ -8309,18 +8513,21 @@ public: // const auto step = fpcast(bitcast(b) & mask_sf) * fsplat(std::exp2(-13.f)); // const auto yval = fpcast(bitcast(a) & mask_yf) * fsplat(std::exp2(-19.f)); // set_vr(op.rt, bitcast((bitcast(b) & mask_se) | (bitcast(base - step * yval) & ~mask_se))); + return; } - else - { - const auto [a, b] = get_vrs(op.ra, op.rb); - const auto mask_se = splat(0xff800000u); // Sign and exponent mask + register_intrinsic("spu_fi", [&](llvm::CallInst* ci) + { + const auto a = bitcast(value(ci->getOperand(0))); + const auto b = bitcast(value(ci->getOperand(1))); const auto base = (b & 0x007ffc00u) << 9; // Base fraction const auto ymul = (b & 0x3ff) * (a & 0x7ffff); // Step fraction * Y fraction (fixed point at 2^-32) const auto bnew = bitcast((base - ymul) >> 9) + (sext(ymul <= base) & (1 << 23)); // Subtract and correct invisible fraction bit - set_vr(op.rt, (b & mask_se) | (bitcast(fpcast(bnew)) & ~mask_se)); // Inject old sign and exponent - } + return bitcast((b & 0xff800000u) | (bitcast(fpcast(bnew)) & ~0xff800000u)); // Inject old sign and exponent + }); + + set_vr(op.rt, fi(get_vr(op.ra), get_vr(op.rb))); } void CFLTS(spu_opcode_t op)