diff --git a/Utilities/JIT.h b/Utilities/JIT.h index 914474d93e..6d9a70e432 100644 --- a/Utilities/JIT.h +++ b/Utilities/JIT.h @@ -247,10 +247,16 @@ inline FT build_function_asm(std::string_view name, F&& builder) Asm compiler(&code); compiler.addEncodingOptions(EncodingOptions::kOptimizedAlign); - if constexpr (std::is_invocable_v) - builder(compiler, args); + if constexpr (std::is_invocable_r_v) + { + if (!builder(compiler, args)) + return nullptr; + } else - builder(compiler); + { + builder(compiler, args); + } + rt.dump_name = name; const auto result = rt._add(&code); jit_announce(result, code.codeSize(), name); diff --git a/rpcs3/Emu/Cell/PPUInterpreter.cpp b/rpcs3/Emu/Cell/PPUInterpreter.cpp index 26ad65d79f..d658e0557c 100644 --- a/rpcs3/Emu/Cell/PPUInterpreter.cpp +++ b/rpcs3/Emu/Cell/PPUInterpreter.cpp @@ -111,13 +111,15 @@ struct ppu_exec_select #define RETURN(...) \ if constexpr (Build == 0) { \ static_cast(exec); \ - static const ppu_intrp_func_t f = build_function_asm("ppu_"s + __func__, [&](asmjit::ppu_builder& c) { \ + static const ppu_intrp_func_t f = build_function_asm("ppu_"s + __func__, [&](asmjit::ppu_builder& c, native_args&) { \ static ppu_opcode_t op{}; \ static ppu_abstract_t ppu; \ exec(__VA_ARGS__); \ c.ppu_ret(); \ + return !c.fail_flag; \ }); \ - return f; \ + if (f) return f; \ + RETURN_(__VA_ARGS__); \ } #else #define RETURN RETURN_ @@ -1019,7 +1021,7 @@ auto VADDUWS() } }; - RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.sat); + RETURN(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.sat); } template @@ -2074,7 +2076,7 @@ auto VNOR() d = gv_notfs(gv_orfs(std::move(a), std::move(b))); }; - RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); + RETURN(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } template @@ -2100,7 +2102,7 @@ auto VPERM() #if defined (ARCH_X64) if constexpr (Build == 0) { - static const ppu_intrp_func_t f = build_function_asm("ppu_VPERM", [&](asmjit::ppu_builder& c) + static const ppu_intrp_func_t f = build_function_asm("ppu_VPERM", [&](asmjit::ppu_builder& c, native_args&) { const auto [v0, v1, v2, v3] = c.vec_alloc<4>(); c.movdqa(v0, c.ppu_vr(s_op.vc)); @@ -2374,17 +2376,12 @@ auto VRLB() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - - for (uint i = 0; i < 16; i++) + static const auto exec = [](auto&& d, auto&& a, auto&& b) { - d._u8[i] = utils::rol8(a._u8[i], b._u8[i]); - } + d = gv_rol8(std::move(a), std::move(b)); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } template @@ -2393,17 +2390,12 @@ auto VRLH() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - - for (uint i = 0; i < 8; i++) + static const auto exec = [](auto&& d, auto&& a, auto&& b) { - d._u16[i] = utils::rol16(a._u16[i], b._u8[i * 2] & 0xf); - } + d = gv_rol16(std::move(a), std::move(b)); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } template @@ -2412,17 +2404,12 @@ auto VRLW() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - - for (uint 
w = 0; w < 4; w++) + static const auto exec = [](auto&& d, auto&& a, auto&& b) { - d._u32[w] = utils::rol32(a._u32[w], b._u8[w * 4] & 0x1f); - } + d = gv_rol32(std::move(a), std::move(b)); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } template @@ -2447,15 +2434,13 @@ auto VSEL() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - const auto& c = ppu.vr[op.vc]; - - d = (b & c) | gv_andn(c, a); + static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& c) + { + auto x = gv_andfs(std::move(b), c); + d = gv_orfs(std::move(x), gv_andnfs(std::move(c), std::move(a))); }; - RETURN_(ppu, op); + + RETURN(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.vr[op.vc]); } template @@ -2464,19 +2449,12 @@ auto VSL() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - v128 VA = ppu.vr[op.va]; - u8 sh = ppu.vr[op.vb]._u8[0] & 0x7; - - d._u8[0] = VA._u8[0] << sh; - for (uint b = 1; b < 16; b++) + static const auto exec = [](auto&& d, auto&& a, auto&& b) { - sh = ppu.vr[op.vb]._u8[b] & 0x7; - d._u8[b] = (VA._u8[b] << sh) | (VA._u8[b - 1] >> (8 - sh)); - } + d = gv_fshl8(std::move(a), gv_shuffle_left<1>(a), std::move(b)); }; - RETURN_(ppu, op); + + RETURN(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } template @@ -2485,38 +2463,35 @@ auto VSLB() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - - for (uint i = 0; i < 16; i++) + static const auto exec = [](auto&& d, auto&& a, auto&& b) { - d._u8[i] = a._u8[i] << (b._u8[i] & 0x7); - } + d = gv_shl8(std::move(a), std::move(b)); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } -template -auto VSLDOI() +template +struct VSLDOI { - if constexpr (Build == 0xf1a6) - return ppu_exec_select::template select<>(); - - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - u8 tmpSRC[32]; - std::memcpy(tmpSRC, &ppu.vr[op.vb], 16); - std::memcpy(tmpSRC + 16, &ppu.vr[op.va], 16); - - for (uint b = 0; b<16; b++) + template + static auto select(bs_t selected, auto func) { - d._u8[15 - b] = tmpSRC[31 - (b + op.vsh)]; + return ppu_exec_select<>::select(selected, func); } - }; - RETURN_(ppu, op); -} + + template + static auto impl() + { + static const auto exec = [](auto&& d, auto&& a, auto&& b) + { + d = gv_or32(gv_shuffle_left(std::move(a)), gv_shuffle_right<16 - Count>(std::move(b))); + }; + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); + } +}; + template auto VSLH() @@ -2524,17 +2499,12 @@ auto VSLH() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - - for (uint h = 0; h < 8; h++) + static const auto exec = [](auto&& d, auto&& a, auto&& b) { - d._u16[h] = a._u16[h] << (b._u16[h] & 0xf); - } + d = gv_shl16(std::move(a), std::move(b)); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } template @@ -2543,19 +2513,12 @@ auto VSLO() if constexpr (Build == 
0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - v128 VA = ppu.vr[op.va]; - u8 nShift = (ppu.vr[op.vb]._u8[0] >> 3) & 0xf; - - d.clear(); - - for (u8 b = 0; b < 16 - nShift; b++) + static const auto exec = [](auto&& d, auto&& a, auto&& b) { - d._u8[15 - b] = VA._u8[15 - (b + nShift)]; - } + d._u = a._u << (b._u8[0] & 0x78); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } template @@ -2564,17 +2527,12 @@ auto VSLW() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - - for (uint w = 0; w < 4; w++) + static const auto exec = [](auto&& d, auto&& a, auto&& b) { - d._u32[w] = a._u32[w] << (b._u32[w] & 0x1f); - } + d = gv_shl32(std::move(a), std::move(b)); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } template @@ -2583,16 +2541,12 @@ auto VSPLTB() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - u8 byte = ppu.vr[op.vb]._u8[15 - op.vuimm]; - - for (uint b = 0; b < 16; b++) + static const auto exec = [](auto&& d, auto&& b, auto&& imm) { - d._u8[b] = byte; - } + d = gv_bcst8(b.u8r[imm & 15]); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.vb], op.vuimm); } template @@ -2601,18 +2555,12 @@ auto VSPLTH() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - ensure((op.vuimm < 8)); - - u16 hword = ppu.vr[op.vb]._u16[7 - op.vuimm]; - - for (uint h = 0; h < 8; h++) + static const auto exec = [](auto&& d, auto&& b, auto&& imm) { - d._u16[h] = hword; - } + d = gv_bcst16(b.u16r[imm & 7]); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.vb], op.vuimm); } template @@ -2621,16 +2569,12 @@ auto VSPLTISB() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - const s8 imm = op.vsimm; - - for (uint b = 0; b < 16; b++) + static const auto exec = [](auto&& d, auto&& imm) { - d._u8[b] = imm; - } + d = gv_bcst8(imm); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], op.vsimm); } template @@ -2639,16 +2583,12 @@ auto VSPLTISH() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - const s16 imm = op.vsimm; - - for (uint h = 0; h < 8; h++) + static const auto exec = [](auto&& d, auto&& imm) { - d._u16[h] = imm; - } + d = gv_bcst16(imm); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], op.vsimm); } template @@ -2657,16 +2597,12 @@ auto VSPLTISW() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - const s32 imm = op.vsimm; - - for (uint w = 0; w < 4; w++) + static const auto exec = [](auto&& d, auto&& imm) { - d._u32[w] = imm; - } + d = gv_bcst32(imm); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], op.vsimm); } template @@ -2675,18 +2611,12 @@ auto VSPLTW() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = 
[](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - ensure((op.vuimm < 4)); - - u32 word = ppu.vr[op.vb]._u32[3 - op.vuimm]; - - for (uint w = 0; w < 4; w++) + static const auto exec = [](auto&& d, auto&& b, auto&& imm) { - d._u32[w] = word; - } + d = gv_bcst32(b.u32r[imm & 3]); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.vb], op.vuimm); } template @@ -2695,19 +2625,12 @@ auto VSR() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - v128 VA = ppu.vr[op.va]; - u8 sh = ppu.vr[op.vb]._u8[15] & 0x7; - - d._u8[15] = VA._u8[15] >> sh; - for (uint b = 14; ~b; b--) + static const auto exec = [](auto&& d, auto&& a, auto&& b) { - sh = ppu.vr[op.vb]._u8[b] & 0x7; - d._u8[b] = (VA._u8[b] >> sh) | (VA._u8[b + 1] << (8 - sh)); - } + d = gv_fshr8(gv_shuffle_right<1>(a), std::move(a), std::move(b)); }; - RETURN_(ppu, op); + + RETURN(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } template @@ -2716,17 +2639,12 @@ auto VSRAB() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - - for (uint i = 0; i < 16; i++) + static const auto exec = [](auto&& d, auto&& a, auto&& b) { - d._s8[i] = a._s8[i] >> (b._u8[i] & 0x7); - } + d = gv_sar8(std::move(a), std::move(b)); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } template @@ -2735,17 +2653,12 @@ auto VSRAH() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - - for (uint h = 0; h < 8; h++) + static const auto exec = [](auto&& d, auto&& a, auto&& b) { - d._s16[h] = a._s16[h] >> (b._u16[h] & 0xf); - } + d = gv_sar16(std::move(a), std::move(b)); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } template @@ -2754,17 +2667,12 @@ auto VSRAW() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - - for (uint w = 0; w < 4; w++) + static const auto exec = [](auto&& d, auto&& a, auto&& b) { - d._s32[w] = a._s32[w] >> (b._u32[w] & 0x1f); - } + d = gv_sar32(std::move(a), std::move(b)); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } template @@ -2773,17 +2681,12 @@ auto VSRB() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - - for (uint i = 0; i < 16; i++) + static const auto exec = [](auto&& d, auto&& a, auto&& b) { - d._u8[i] = a._u8[i] >> (b._u8[i] & 0x7); - } + d = gv_shr8(std::move(a), std::move(b)); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } template @@ -2792,17 +2695,12 @@ auto VSRH() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - - for (uint h = 0; h < 8; h++) + 
static const auto exec = [](auto&& d, auto&& a, auto&& b) { - d._u16[h] = a._u16[h] >> (b._u16[h] & 0xf); - } + d = gv_shr16(std::move(a), std::move(b)); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } template @@ -2811,19 +2709,12 @@ auto VSRO() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - v128 VA = ppu.vr[op.va]; - u8 nShift = (ppu.vr[op.vb]._u8[0] >> 3) & 0xf; - - d.clear(); - - for (u8 b = 0; b < 16 - nShift; b++) + static const auto exec = [](auto&& d, auto&& a, auto&& b) { - d._u8[b] = VA._u8[b + nShift]; - } + d._u = a._u >> (b._u8[0] & 0x78); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } template @@ -2832,17 +2723,12 @@ auto VSRW() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); - static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - auto& d = ppu.vr[op.vd]; - const auto& a = ppu.vr[op.va]; - const auto& b = ppu.vr[op.vb]; - - for (uint w = 0; w < 4; w++) + static const auto exec = [](auto&& d, auto&& a, auto&& b) { - d._u32[w] = a._u32[w] >> (b._u32[w] & 0x1f); - } + d = gv_shr32(std::move(a), std::move(b)); }; - RETURN_(ppu, op); + + RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb]); } template @@ -3184,30 +3070,14 @@ auto VUPKHPX() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); -#if defined(ARCH_X64_0) - static const auto make = [](asmjit::ppu_builder& c) - { - const auto [v0, v1, v2] = c.vec_alloc<3>(); - EMIT(punpckhwd, v0, v0, c.ppu_vr(s_op.vb)); - EMIT(psrad, v0, v0, c.imm(16)); - EMIT(pslld, v1, v0, c.imm(6)); - EMIT(pslld, v2, v0, c.imm(3)); - BCST(pand, d, v0, v0, c.get_bcst(0xff00001f)); - BCST(pand, d, v1, v1, c.get_bcst(0x1f0000)); - BCST(pand, d, v2, v2, c.get_bcst(0x1f00)); - EMIT(por, v0, v0, v1); - EMIT(por, v0, v0, v2); - LDST(movaps, c.ppu_vr(s_op.vd, true), v0); - c.ppu_ret(); - }; -#endif static const auto exec = [](auto&& d, auto&& b) { - const auto x = gv_extend_hi_s16(b); - d = gv_and32(x, gv_bcst32(0xff00001f)) | gv_and32(gv_shl32(x, 6), gv_bcst32(0x1f0000)) | gv_and32(gv_shl32(x, 3), gv_bcst32(0x1f00)); + auto x = gv_extend_hi_s16(std::move(b)); + auto y = gv_or32(gv_and32(gv_shl32(x, 6), gv_bcst32(0x1f0000)), gv_and32(gv_shl32(x, 3), gv_bcst32(0x1f00))); + d = gv_or32(std::move(y), gv_and32(std::move(x), gv_bcst32(0xff00001f))); }; - RETURN_(ppu.vr[op.vd], ppu.vr[op.vb]); + RETURN(ppu.vr[op.vd], ppu.vr[op.vb]); } template @@ -3216,22 +3086,12 @@ auto VUPKHSB() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); -#if defined(ARCH_X64_0) - static const auto make = [](asmjit::ppu_builder& c) - { - const auto v0 = c.vec_alloc(); - EMIT(punpckhbw, v0, v0, c.ppu_vr(s_op.vb)); - EMIT(psraw, v0, v0, c.imm(8)); - LDST(movaps, c.ppu_vr(s_op.vd, true), v0); - c.ppu_ret(); - }; -#endif static const auto exec = [](auto&& d, auto&& b) { - d = gv_extend_hi_s8(b); + d = gv_extend_hi_s8(std::move(b)); }; - RETURN_(ppu.vr[op.vd], ppu.vr[op.vb]); + RETURN(ppu.vr[op.vd], ppu.vr[op.vb]); } template @@ -3240,22 +3100,12 @@ auto VUPKHSH() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); -#if defined(ARCH_X64_0) - static const auto make = [](asmjit::ppu_builder& c) - { - const auto v0 = c.vec_alloc(); - EMIT(punpckhwd, v0, v0, c.ppu_vr(s_op.vb)); - EMIT(psrad, v0, v0, c.imm(16)); - LDST(movaps, c.ppu_vr(s_op.vd, true), v0); - c.ppu_ret(); - }; -#endif static const 
auto exec = [](auto&& d, auto&& b) { - d = gv_extend_hi_s16(b); + d = gv_extend_hi_s16(std::move(b)); }; - RETURN_(ppu.vr[op.vd], ppu.vr[op.vb]); + RETURN(ppu.vr[op.vd], ppu.vr[op.vb]); } template @@ -3264,37 +3114,14 @@ auto VUPKLPX() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); -#if defined(ARCH_X64_0) - static const auto make = [](asmjit::ppu_builder& c) - { - const auto [v0, v1, v2] = c.vec_alloc<3>(); - if (utils::has_sse41()) - { - LDST(pmovsxwd, v0, c.ppu_vr<8>(s_op.vb)); - } - else - { - EMIT(punpcklwd, v0, v0, c.ppu_vr(s_op.vb)); - EMIT(psrad, v0, v0, c.imm(16)); - } - EMIT(pslld, v1, v0, c.imm(6)); - EMIT(pslld, v2, v0, c.imm(3)); - BCST(pand, d, v0, v0, c.get_bcst(0xff00001f)); - BCST(pand, d, v1, v1, c.get_bcst(0x1f0000)); - BCST(pand, d, v2, v2, c.get_bcst(0x1f00)); - EMIT(por, v0, v0, v1); - EMIT(por, v0, v0, v2); - LDST(movaps, c.ppu_vr(s_op.vd, true), v0); - c.ppu_ret(); - }; -#endif static const auto exec = [](auto&& d, auto&& b) { - const auto x = gv_extend_lo_s16(b); - d = gv_and32(x, gv_bcst32(0xff00001f)) | gv_and32(gv_shl32(x, 6), gv_bcst32(0x1f0000)) | gv_and32(gv_shl32(x, 3), gv_bcst32(0x1f00)); + auto x = gv_extend_lo_s16(std::move(b)); + auto y = gv_or32(gv_and32(gv_shl32(x, 6), gv_bcst32(0x1f0000)), gv_and32(gv_shl32(x, 3), gv_bcst32(0x1f00))); + d = gv_or32(std::move(y), gv_and32(std::move(x), gv_bcst32(0xff00001f))); }; - RETURN_(ppu.vr[op.vd], ppu.vr[op.vb]); + RETURN(ppu.vr[op.vd], ppu.vr[op.vb]); } template @@ -3303,29 +3130,12 @@ auto VUPKLSB() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); -#if defined(ARCH_X64_0) - static const auto make = [](asmjit::ppu_builder& c) - { - const auto v0 = c.vec_alloc(); - if (utils::has_sse41()) - { - LDST(pmovsxbw, v0, c.ppu_vr<8>(s_op.vb)); - } - else - { - EMIT(punpcklbw, v0, v0, c.ppu_vr(s_op.vb)); - EMIT(psraw, v0, v0, c.imm(8)); - } - LDST(movaps, c.ppu_vr(s_op.vd, true), v0); - c.ppu_ret(); - }; -#endif static const auto exec = [](auto&& d, auto&& b) { - d = gv_extend_lo_s8(b); + d = gv_extend_lo_s8(std::move(b)); }; - RETURN_(ppu.vr[op.vd], ppu.vr[op.vb]); + RETURN(ppu.vr[op.vd], ppu.vr[op.vb]); } template @@ -3334,29 +3144,12 @@ auto VUPKLSH() if constexpr (Build == 0xf1a6) return ppu_exec_select::template select<>(); -#if defined(ARCH_X64_0) - static const auto make = [](asmjit::ppu_builder& c) - { - const auto v0 = c.vec_alloc(); - if (utils::has_sse41()) - { - LDST(pmovsxwd, v0, c.ppu_vr<8>(s_op.vb)); - } - else - { - EMIT(punpcklwd, v0, v0, c.ppu_vr(s_op.vb)); - EMIT(psrad, v0, v0, c.imm(16)); - } - LDST(movaps, c.ppu_vr(s_op.vd, true), v0); - c.ppu_ret(); - }; -#endif static const auto exec = [](auto&& d, auto&& b) { - d = gv_extend_lo_s16(b); + d = gv_extend_lo_s16(std::move(b)); }; - RETURN_(ppu.vr[op.vd], ppu.vr[op.vb]); + RETURN(ppu.vr[op.vd], ppu.vr[op.vb]); } template @@ -7157,7 +6950,8 @@ struct ppu_interpreter_t IT VSEL; IT VSL; IT VSLB; - IT VSLDOI; + IT VSLDOI{}; + IT VSLDOI_[16]; IT VSLH; IT VSLO; IT VSLW; @@ -7629,6 +7423,27 @@ ppu_interpreter_rt_base::ppu_interpreter_rt_base() noexcept return ::name<0, Flags...>(); \ }); \ +#define INIT_ONE(name, bits) \ + ptrs->name##_[0b##bits] = ::name<0b##bits>::select(selected, []() { \ + return ::name<0b##bits>::impl<0, Flags...>(); \ + }); \ + +#define INIT_PACK2(name, bits) \ + INIT_ONE(name, bits##0) \ + INIT_ONE(name, bits##1) \ + +#define INIT_PACK4(name, bits) \ + INIT_PACK2(name, bits##0) \ + INIT_PACK2(name, bits##1) \ + +#define INIT_PACK8(name, bits) \ + INIT_PACK4(name, bits##0) \ + 
INIT_PACK4(name, bits##1) \ + +#define INIT_PACK16(name, bits) \ + INIT_PACK8(name, bits##0) \ + INIT_PACK8(name, bits##1) \ + INIT(MFVSCR); INIT(MTVSCR); INIT(VADDCUW); @@ -7732,7 +7547,7 @@ ppu_interpreter_rt_base::ppu_interpreter_rt_base() noexcept INIT(VSEL); INIT(VSL); INIT(VSLB); - INIT(VSLDOI); + INIT_PACK16(VSLDOI,); INIT(VSLH); INIT(VSLO); INIT(VSLW); @@ -8051,6 +7866,7 @@ ppu_intrp_func_t ppu_interpreter_rt::decode(u32 opv) const noexcept break; } + case ppu_itype::VSLDOI: return ptrs->VSLDOI_[op.vsh]; default: break; } diff --git a/rpcs3/util/asm.hpp b/rpcs3/util/asm.hpp index 5a989a0a60..5efe3c0fbb 100644 --- a/rpcs3/util/asm.hpp +++ b/rpcs3/util/asm.hpp @@ -193,7 +193,7 @@ namespace utils #elif defined(__clang__) return __builtin_rotateleft32(x, n); #else - return (x << n) | (x >> (32 - n)); + return (x << (n & 31)) | (x >> (((0 - n) & 31))); #endif } @@ -209,7 +209,7 @@ namespace utils #elif defined(__clang__) return __builtin_rotateleft64(x, n); #else - return (x << n) | (x >> (64 - n)); + return (x << (n & 63)) | (x >> (((0 - n) & 63))); #endif } diff --git a/rpcs3/util/simd.hpp b/rpcs3/util/simd.hpp index c3c1b52ff8..d04ab111db 100644 --- a/rpcs3/util/simd.hpp +++ b/rpcs3/util/simd.hpp @@ -3,6 +3,7 @@ #include "util/types.hpp" #include "util/v128.hpp" #include "util/sysinfo.hpp" +#include "util/asm.hpp" #include "Utilities/JIT.h" #if defined(ARCH_X64) @@ -40,6 +41,7 @@ namespace asmjit #else struct gpr_type : Operand { + gpr_type() = default; gpr_type(u32) { } @@ -47,6 +49,7 @@ namespace asmjit struct vec_type : Operand { + vec_type() = default; vec_type(u32) { } @@ -82,7 +85,7 @@ namespace asmjit template > constexpr arg_class arg_classify = - std::is_base_of_v ? arg_class::imm_lv + !std::is_reference_v : + std::is_same_v ? arg_class::imm_lv + !std::is_reference_v : std::is_base_of_v ? arg_class::mem_lv : std::is_base_of_v ? arg_class::mem_lv + !std::is_reference_v : std::is_reference_v ? arg_class::reg_lv : arg_class::reg_rv; @@ -91,6 +94,8 @@ namespace asmjit { using base = native_asm; + bool fail_flag = false; + vec_builder(CodeHolder* ch) : native_asm(ch) { @@ -150,6 +155,9 @@ namespace asmjit std::unordered_map consts[16]{}; +#if defined(ARCH_X64) + std::unordered_map const_allocs{}; + template x86::Mem get_const(const T& data, u32 esize = Size) { @@ -180,14 +188,97 @@ namespace asmjit return x86::Mem(_label, 0, Size); } +#endif + }; + + struct free_on_exit + { + Operand x{}; + + free_on_exit() = default; + free_on_exit(const free_on_exit&) = delete; + free_on_exit& operator=(const free_on_exit&) = delete; + + ~free_on_exit() + { + if (x.isReg()) + { + vec_type v; + v.copyFrom(x); + g_vc->vec_dealloc(v); + } + } }; #if defined(ARCH_X64) - inline auto arg_eval(const v128& _c, u32 esize) + inline Operand arg_eval(v128& _c, u32 esize) { - // TODO: implement PSHUFD broadcasts and AVX ones - auto r = g_vc->get_const(_c, esize); - return r; + const auto found = g_vc->const_allocs.find(_c); + + if (found != g_vc->const_allocs.end()) + { + return found->second; + } + + vec_type reg = g_vc->vec_alloc(); + + // TODO: PSHUFD style broadcast? 
Needs known const layout + if (utils::has_avx() && _c._u64[0] == _c._u64[1]) + { + if (_c._u32[0] == _c._u32[1]) + { + if (utils::has_avx2() && _c._u16[0] == _c._u16[1]) + { + if (_c._u8[0] == _c._u8[1]) + { + ensure(!g_vc->vpbroadcastb(reg, g_vc->get_const(_c._u8[0]))); + } + else + { + ensure(!g_vc->vpbroadcastw(reg, g_vc->get_const(_c._u16[0]))); + } + } + else + { + ensure(!g_vc->vbroadcastss(reg, g_vc->get_const(_c._u32[0]))); + } + } + else + { + ensure(!g_vc->vbroadcastsd(reg, g_vc->get_const(_c._u32[0]))); + } + } + else if (!_c._u) + { + ensure(!g_vc->pxor(reg, reg)); + } + else if (!~_c._u) + { + ensure(!g_vc->pcmpeqd(reg, reg)); + } + else + { + ensure(!g_vc->movaps(reg, g_vc->get_const(_c, esize))); + } + + g_vc->const_allocs.emplace(_c, reg); + return reg; + } + + inline Operand arg_eval(v128&& _c, u32 esize) + { + const auto found = g_vc->const_allocs.find(_c); + + if (found != g_vc->const_allocs.end()) + { + vec_type r = found->second; + g_vc->const_allocs.erase(found); + g_vc->vec_dealloc(r); + return r; + } + + // Hack: assume can use mem op (TODO) + return g_vc->get_const(_c, esize); } template requires(std::is_base_of_v>) @@ -211,12 +302,24 @@ namespace asmjit return std::move(mem); } + inline void arg_free(const v128&) + { + } + + inline void arg_free(const Operand& op) + { + if (op.isReg()) + { + g_vc->vec_dealloc(vec_type{op.id()}); + } + } + template inline bool arg_use_evex(const auto& op) { constexpr auto _class = arg_classify; if constexpr (_class == arg_class::imm_rv) - return true; + return g_vc->const_allocs.count(op) == 0; else if constexpr (_class == arg_class::imm_lv) return false; else if (op.isMem()) @@ -302,6 +405,7 @@ namespace asmjit template vec_type binary_op(u32 esize, x86::Inst::Id mov_op, x86::Inst::Id sse_op, x86::Inst::Id avx_op, x86::Inst::Id evex_op, A&& a, B&& b, Args&&... 
args) { + free_on_exit e; Operand src1{}; if constexpr (arg_classify == arg_class::reg_rv) @@ -317,12 +421,13 @@ namespace asmjit if constexpr (arg_classify == arg_class::reg_rv) { - g_vc->vec_dealloc(vec_type{b.id()}); - //b = Operand(); + e.x = b; } } else if (utils::has_avx() && avx_op && (arg_classify == arg_class::reg_lv || arg_classify == arg_class::mem_lv)) { + Operand srca = arg_eval(std::forward(a), 16); + if constexpr (arg_classify == arg_class::reg_lv) { if constexpr (arg_classify == arg_class::reg_rv) @@ -336,47 +441,79 @@ namespace asmjit src1 = g_vc->vec_alloc(); } } - else // if A == arg_class::reg_rv + else { src1 = g_vc->vec_alloc(); - if (!a.isReg()) - { - static_cast(arg_eval(std::forward(a), 16)); - } - if constexpr (arg_classify == arg_class::reg_rv) { - g_vc->vec_dealloc(vec_type{b.id()}); - //b = Operand(); + e.x = b; } } if (utils::has_avx512() && evex_op && arg_use_evex(b)) { - ensure(!g_vc->evex().emit(evex_op, src1, vec_type{a.id()}, arg_eval(std::forward(b), esize), std::forward(args)...)); + ensure(!g_vc->evex().emit(evex_op, src1, srca, arg_eval(std::forward(b), esize), std::forward(args)...)); return vec_type{src1.id()}; } - ensure(!g_vc->emit(avx_op, src1, vec_type{a.id()}, arg_eval(std::forward(b), 16), std::forward(args)...)); + ensure(!g_vc->emit(avx_op, src1, srca, arg_eval(std::forward(b), 16), std::forward(args)...)); return vec_type{src1.id()}; } else do { - if constexpr (arg_classify == arg_class::reg_rv) + if constexpr (arg_classify == arg_class::mem_rv) { - g_vc->vec_dealloc(vec_type{b.id()}); - //b = Operand(); + if (a.isReg()) + { + src1 = vec_type(a.id()); + + if constexpr (arg_classify == arg_class::reg_rv) + { + e.x = b; + } + break; + } } - if (arg_classify == arg_class::mem_rv && a.isReg()) + if constexpr (arg_classify == arg_class::imm_rv) { - src1 = vec_type(a.id()); - break; + if (auto found = g_vc->const_allocs.find(a); found != g_vc->const_allocs.end()) + { + src1 = found->second; + g_vc->const_allocs.erase(found); + + if constexpr (arg_classify == arg_class::reg_rv) + { + e.x = b; + } + break; + } } src1 = g_vc->vec_alloc(); + if constexpr (arg_classify == arg_class::reg_rv) + { + e.x = b; + } + + if constexpr (arg_classify == arg_class::imm_rv) + { + if (!a._u) + { + // All zeros + ensure(!g_vc->emit(x86::Inst::kIdPxor, src1, src1)); + break; + } + else if (!~a._u) + { + // All ones + ensure(!g_vc->emit(x86::Inst::kIdPcmpeqd, src1, src1)); + break; + } + } + // Fallback to arg copy ensure(!g_vc->emit(mov_op, src1, arg_eval(std::forward(a), 16))); } @@ -404,10 +541,14 @@ namespace asmjit } inline v128 gv_select8(const v128& _cmp, const v128& _true, const v128& _false); +inline v128 gv_signselect8(const v128& bits, const v128& _true, const v128& _false); inline v128 gv_select16(const v128& _cmp, const v128& _true, const v128& _false); inline v128 gv_select32(const v128& _cmp, const v128& _true, const v128& _false); inline v128 gv_selectfs(const v128& _cmp, const v128& _true, const v128& _false); +template requires (asmjit::any_operand_v) +inline asmjit::vec_type gv_gts32(A&&, B&&); + inline void gv_set_zeroing_denormals() { #if defined(ARCH_X64) @@ -704,6 +845,16 @@ inline v128 gv_not32(const v128& a) #endif } +template requires (asmjit::any_operand_v) +inline auto gv_not32(A&& a) +{ +#if defined(ARCH_X64) + asmjit::vec_type ones = g_vc->vec_alloc(); + g_vc->pcmpeqd(ones, ones); + FOR_X64(binary_op, 4, kIdMovdqa, kIdPxor, kIdVpxor, kIdVpxord, std::move(ones), std::forward(a)); +#endif +} + inline v128 gv_notfs(const v128& a) { #if 
defined(ARCH_X64) @@ -713,6 +864,16 @@ inline v128 gv_notfs(const v128& a) #endif } +template requires (asmjit::any_operand_v) +inline auto gv_notfs(A&& a) +{ +#if defined(ARCH_X64) + asmjit::vec_type ones = g_vc->vec_alloc(); + g_vc->pcmpeqd(ones, ones); + FOR_X64(binary_op, 4, kIdMovaps, kIdXorps, kIdVxorps, kIdVxorps, std::move(ones), std::forward(a)); +#endif +} + inline v128 gv_shl16(const v128& a, u32 count) { if (count >= 16) @@ -724,7 +885,7 @@ inline v128 gv_shl16(const v128& a, u32 count) #endif } -template requires(asmjit::any_operand_v) +template requires (asmjit::any_operand_v) inline auto gv_shl16(A&& a, u32 count) { FOR_X64(unary_op, kIdPsllw, kIdVpsllw, std::forward(a), count); @@ -741,7 +902,7 @@ inline v128 gv_shl32(const v128& a, u32 count) #endif } -template requires(asmjit::any_operand_v) +template requires (asmjit::any_operand_v) inline auto gv_shl32(A&& a, u32 count) { FOR_X64(unary_op, kIdPslld, kIdVpslld, std::forward(a), count); @@ -758,7 +919,7 @@ inline v128 gv_shl64(const v128& a, u32 count) #endif } -template requires(asmjit::any_operand_v) +template requires (asmjit::any_operand_v) inline auto gv_shl64(A&& a, u32 count) { FOR_X64(unary_op, kIdPsllq, kIdVpsllq, std::forward(a), count); @@ -775,7 +936,7 @@ inline v128 gv_shr16(const v128& a, u32 count) #endif } -template requires(asmjit::any_operand_v) +template requires (asmjit::any_operand_v) inline auto gv_shr16(A&& a, u32 count) { FOR_X64(unary_op, kIdPsrlw, kIdVpsrlw, std::forward(a), count); @@ -792,7 +953,7 @@ inline v128 gv_shr32(const v128& a, u32 count) #endif } -template requires(asmjit::any_operand_v) +template requires (asmjit::any_operand_v) inline auto gv_shr32(A&& a, u32 count) { FOR_X64(unary_op, kIdPsrld, kIdVpsrld, std::forward(a), count); @@ -809,7 +970,7 @@ inline v128 gv_shr64(const v128& a, u32 count) #endif } -template requires(asmjit::any_operand_v) +template requires (asmjit::any_operand_v) inline auto gv_shr64(A&& a, u32 count) { FOR_X64(unary_op, kIdPsrlq, kIdVpsrlq, std::forward(a), count); @@ -826,7 +987,7 @@ inline v128 gv_sar16(const v128& a, u32 count) #endif } -template requires(asmjit::any_operand_v) +template requires (asmjit::any_operand_v) inline auto gv_sar16(A&& a, u32 count) { FOR_X64(unary_op, kIdPsraw, kIdVpsraw, std::forward(a), count); @@ -843,7 +1004,7 @@ inline v128 gv_sar32(const v128& a, u32 count) #endif } -template requires(asmjit::any_operand_v) +template requires (asmjit::any_operand_v) inline auto gv_sar32(A&& a, u32 count) { FOR_X64(unary_op, kIdPsrad, kIdVpsrad, std::forward(a), count); @@ -867,6 +1028,20 @@ inline v128 gv_sar64(const v128& a, u32 count) #endif } +template requires (asmjit::any_operand_v) +inline auto gv_sar64(A&& a, u32 count) +{ + if (count >= 64) + count = 63; +#if defined(ARCH_X64) + using enum asmjit::x86::Inst::Id; + if (utils::has_avx512()) + return asmjit::unary_op(kIdNone, kIdVpsraq, std::forward(a), count); + g_vc->fail_flag = true; + return std::forward(a); +#endif +} + inline v128 gv_add8(const v128& a, const v128& b) { #if defined(ARCH_X64) @@ -1025,6 +1200,20 @@ inline v128 gv_addus_u32(const v128& a, const v128& b) #endif } +template requires (asmjit::any_operand_v) +inline asmjit::vec_type gv_addus_u32(A&& a, B&& b) +{ +#if defined(ARCH_X64) + if (utils::has_sse41()) + return gv_add32(gv_minu32(std::forward(b), gv_not32(a)), std::forward(a)); + auto s = gv_add32(a, b); + auto x = gv_xor32(std::forward(b), gv_bcst32(0x80000000)); + auto y = gv_xor32(std::forward(a), gv_bcst32(0x7fffffff)); + return gv_or32(std::move(s), 
gv_gts32(std::move(x), std::move(y))); +#endif + return {}; +} + inline v128 gv_addfs(const v128& a, const v128& b) { #if defined(ARCH_X64) @@ -1052,6 +1241,12 @@ inline v128 gv_sub8(const v128& a, const v128& b) #endif } +template requires (asmjit::any_operand_v) +inline auto gv_sub8(A&& a, B&& b) +{ + FOR_X64(binary_op, 1, kIdMovdqa, kIdPsubb, kIdVpsubb, kIdNone, std::forward(a), std::forward(b)); +} + inline v128 gv_sub16(const v128& a, const v128& b) { #if defined(ARCH_X64) @@ -1265,6 +1460,21 @@ inline v128 gv_minu32(const v128& a, const v128& b) #endif } +template requires (asmjit::any_operand_v) +inline asmjit::vec_type gv_minu32(A&& a, B&& b) +{ +#if defined(ARCH_X64) + if (utils::has_sse41()) + FOR_X64(binary_op, 4, kIdMovdqa, kIdPminud, kIdVpminud, kIdVpminud, std::forward(a), std::forward(b)); + auto s = gv_bcst32(0x80000000); + auto x = gv_xor32(a, s); + auto m = gv_gts32(std::move(x), gv_xor32(std::move(s), b)); + auto z = gv_and32(m, std::move(b)); + return gv_or32(std::move(z), gv_andn32(std::move(m), std::move(a))); +#endif + return {}; +} + inline v128 gv_mins8(const v128& a, const v128& b) { #if defined(__SSE4_1__) @@ -1493,6 +1703,13 @@ inline v128 gv_gts8(const v128& a, const v128& b) #endif } +template requires (asmjit::any_operand_v) +inline asmjit::vec_type gv_gts8(A&& a, B&& b) +{ + FOR_X64(binary_op, 1, kIdMovdqa, kIdPcmpgtb, kIdVpcmpgtb, kIdNone, std::forward(a), std::forward(b)); + return {}; +} + inline v128 gv_gts16(const v128& a, const v128& b) { #if defined(ARCH_X64) @@ -1511,6 +1728,13 @@ inline v128 gv_gts32(const v128& a, const v128& b) #endif } +template requires (asmjit::any_operand_v) +inline asmjit::vec_type gv_gts32(A&& a, B&& b) +{ + FOR_X64(binary_op, 4, kIdMovdqa, kIdPcmpgtd, kIdVpcmpgtd, kIdNone, std::forward(a), std::forward(b)); + return {}; +} + inline v128 gv_avgu8(const v128& a, const v128& b) { #if defined(ARCH_X64) @@ -2154,7 +2378,7 @@ inline v128 gv_andn(const v128& a, const v128& b) } // Select elements; _cmp must be result of SIMD comparison; undefined otherwise -inline v128 gv_select8(const v128& _cmp, const v128& _true, const v128& _false) +FORCE_INLINE v128 gv_select8(const v128& _cmp, const v128& _true, const v128& _false) { #if defined(__SSE4_1__) return _mm_blendv_epi8(_false, _true, _cmp); @@ -2165,6 +2389,45 @@ inline v128 gv_select8(const v128& _cmp, const v128& _true, const v128& _false) #endif } +// Select elements using sign bit only +FORCE_INLINE v128 gv_signselect8(const v128& bits, const v128& _true, const v128& _false) +{ +#if defined(__SSE4_1__) + return _mm_blendv_epi8(_false, _true, bits); +#else + return gv_select8(gv_gts8(gv_bcst8(0), bits), _true, _false); +#endif +} + +template requires (asmjit::any_operand_v) +inline asmjit::vec_type gv_signselect8(A&& bits, B&& _true, C&& _false) +{ + using namespace asmjit; +#if defined(ARCH_X64) + if (utils::has_avx()) + { + Operand arg0{}; + Operand arg1 = arg_eval(std::forward(bits), 16); + Operand arg2 = arg_eval(std::forward(_true), 16); + Operand arg3 = arg_eval(std::forward(_false), 16); + if constexpr (!std::is_reference_v) + arg0.isReg() ? arg_free(bits) : arg0.copyFrom(arg1); + if constexpr (!std::is_reference_v) + arg0.isReg() ? arg_free(_true) : arg0.copyFrom(arg2); + if constexpr (!std::is_reference_v) + arg0.isReg() ? 
arg_free(_false) : arg0.copyFrom(arg3); + if (arg0.isNone()) + arg0 = g_vc->vec_alloc(); + g_vc->emit(x86::Inst::kIdVpblendvb, arg0, arg3, arg2, arg1); + vec_type r; + r.copyFrom(arg0); + return r; + } +#endif + g_vc->fail_flag = true; + return vec_type{0}; +} + // Select elements; _cmp must be result of SIMD comparison; undefined otherwise inline v128 gv_select16(const v128& _cmp, const v128& _true, const v128& _false) { @@ -2305,6 +2568,17 @@ inline v128 gv_extend_lo_s8(const v128& vec) #endif } +template requires (asmjit::any_operand_v) +inline auto gv_extend_lo_s8(A&& a) +{ +#if defined(ARCH_X64) + using enum asmjit::x86::Inst::Id; + if (utils::has_sse41()) + return asmjit::unary_op(kIdPmovsxbw, kIdVpmovsxbw, std::forward(a)); + return asmjit::unary_op(kIdPsraw, kIdVpsraw, asmjit::unary_op(kIdNone, kIdPunpcklbw, std::forward(a)), 8); +#endif +} + inline v128 gv_extend_hi_s8(const v128& vec) { #if defined(__SSE4_1__) @@ -2316,6 +2590,15 @@ inline v128 gv_extend_hi_s8(const v128& vec) #endif } +template requires (asmjit::any_operand_v) +inline auto gv_extend_hi_s8(A&& a) +{ +#if defined(ARCH_X64) + using enum asmjit::x86::Inst::Id; + return asmjit::unary_op(kIdPsraw, kIdVpsraw, asmjit::unary_op(kIdNone, kIdPunpckhbw, std::forward(a)), 8); +#endif +} + inline v128 gv_unpacklo16(const v128& lows, const v128& highs) { #if defined(ARCH_X64) @@ -2336,6 +2619,17 @@ inline v128 gv_extend_lo_s16(const v128& vec) #endif } +template requires (asmjit::any_operand_v) +inline auto gv_extend_lo_s16(A&& a) +{ +#if defined(ARCH_X64) + using enum asmjit::x86::Inst::Id; + if (utils::has_sse41()) + return asmjit::unary_op(kIdPmovsxwd, kIdVpmovsxwd, std::forward(a)); + return asmjit::unary_op(kIdPsrad, kIdVpsrad, asmjit::unary_op(kIdNone, kIdPunpcklwd, std::forward(a)), 16); +#endif +} + inline v128 gv_extend_hi_s16(const v128& vec) { #if defined(__SSE4_1__) @@ -2347,6 +2641,15 @@ inline v128 gv_extend_hi_s16(const v128& vec) #endif } +template requires (asmjit::any_operand_v) +inline auto gv_extend_hi_s16(A&& a) +{ +#if defined(ARCH_X64) + using enum asmjit::x86::Inst::Id; + return asmjit::unary_op(kIdPsrad, kIdVpsrad, asmjit::unary_op(kIdNone, kIdPunpckhwd, std::forward(a)), 16); +#endif +} + inline v128 gv_unpacklo32(const v128& lows, const v128& highs) { #if defined(ARCH_X64) @@ -2471,3 +2774,280 @@ inline v128 gv_log2_approxfs(const v128& a) return r; #endif } + +// For each 8-bit element, r = a << (b & 7) +inline v128 gv_shl8(const v128& a, const v128& b) +{ +#if defined(ARCH_ARM64) + return vshlq_u8(a, vandq_s8(b, gv_bcst8(7))); +#else + const v128 x1 = gv_add8(a, a); // shift left by 1 + const v128 r1 = gv_signselect8(gv_shl64(b, 7), x1, a); + const v128 x2 = gv_and32(gv_shl64(r1, 2), gv_bcst8(0xfc)); // shift by 2 + const v128 r2 = gv_signselect8(gv_shl64(b, 6), x2, r1); + const v128 x3 = gv_and32(gv_shl64(r2, 4), gv_bcst8(0xf0)); // shift by 4 + return gv_signselect8(gv_shl64(b, 5), x3, r2); +#endif +} + +// For each 16-bit element, r = a << (b & 15) +inline v128 gv_shl16(const v128& a, const v128& b) +{ +#if defined(__AVX512VL__) && defined(__AVX512BW__) + return _mm_sllv_epi16(a, _mm_and_si128(b, _mm_set1_epi16(15))); +#elif defined(ARCH_ARM64) + return vshlq_u16(a, vandq_s16(b, gv_bcst8(15))); +#else + v128 r; + for (u32 i = 0; i < 8; i++) + r._u16[i] = a._u16[i] << (b._u16[i] & 15); + return r; +#endif +} + +// For each 32-bit element, r = a << (b & 31) +inline v128 gv_shl32(const v128& a, const v128& b) +{ +#if defined(__AVX2__) + return _mm_sllv_epi32(a, _mm_and_si128(b, 
_mm_set1_epi32(31))); +#elif defined(ARCH_ARM64) + return vshlq_u32(a, vandq_s32(b, gv_bcst8(31))); +#else + v128 r; + for (u32 i = 0; i < 4; i++) + r._u32[i] = a._u32[i] << (b._u32[i] & 31); + return r; +#endif +} + +// For each unsigned 8-bit element, r = a >> (b & 7) +inline v128 gv_shr8(const v128& a, const v128& b) +{ +#if defined(ARCH_ARM64) + return vshlq_u8(a, vnegq_s8(vandq_s8(b, gv_bcst8(7)))); +#else + const v128 x1 = gv_and32(gv_shr64(a, 1), gv_bcst8(0x7f)); // shift right by 1 + const v128 r1 = gv_signselect8(gv_shl64(b, 7), x1, a); + const v128 x2 = gv_and32(gv_shr64(r1, 2), gv_bcst8(0x3f)); // shift by 2 + const v128 r2 = gv_signselect8(gv_shl64(b, 6), x2, r1); + const v128 x3 = gv_and32(gv_shr64(r2, 4), gv_bcst8(0x0f)); // shift by 4 + return gv_signselect8(gv_shl64(b, 5), x3, r2); +#endif +} + +// For each unsigned 16-bit element, r = a >> (b & 15) +inline v128 gv_shr16(const v128& a, const v128& b) +{ +#if defined(__AVX512VL__) && defined(__AVX512BW__) + return _mm_srlv_epi16(a, _mm_and_si128(b, _mm_set1_epi16(15))); +#elif defined(ARCH_ARM64) + return vshlq_u16(a, vnegq_s16(vandq_s16(b, gv_bcst8(15)))); +#else + v128 r; + for (u32 i = 0; i < 8; i++) + r._u16[i] = a._u16[i] >> (b._u16[i] & 15); + return r; +#endif +} + +// For each unsigned 32-bit element, r = a >> (b & 31) +inline v128 gv_shr32(const v128& a, const v128& b) +{ +#if defined(__AVX2__) + return _mm_srlv_epi32(a, _mm_and_si128(b, _mm_set1_epi32(31))); +#elif defined(ARCH_ARM64) + return vshlq_u32(a, vnegq_s32(vandq_s32(b, gv_bcst8(31)))); +#else + v128 r; + for (u32 i = 0; i < 4; i++) + r._u32[i] = a._u32[i] >> (b._u32[i] & 31); + return r; +#endif +} + +// For each signed 8-bit element, r = a >> (b & 7) +inline v128 gv_sar8(const v128& a, const v128& b) +{ +#if defined(ARCH_ARM64) + return vshlq_s8(a, vnegq_s8(vandq_s8(b, gv_bcst8(7)))); +#else + v128 r; + for (u32 i = 0; i < 16; i++) + r._s8[i] = a._s8[i] >> (b._s8[i] & 7); + return r; +#endif +} + +// For each signed 16-bit element, r = a >> (b & 15) +inline v128 gv_sar16(const v128& a, const v128& b) +{ +#if defined(__AVX512VL__) && defined(__AVX512BW__) + return _mm_srav_epi16(a, _mm_and_si128(b, _mm_set1_epi16(15))); +#elif defined(ARCH_ARM64) + return vshlq_s16(a, vnegq_s16(vandq_s16(b, gv_bcst8(15)))); +#else + v128 r; + for (u32 i = 0; i < 8; i++) + r._s16[i] = a._s16[i] >> (b._s16[i] & 15); + return r; +#endif +} + +// For each signed 32-bit element, r = a >> (b & 31) +inline v128 gv_sar32(const v128& a, const v128& b) +{ +#if defined(__AVX2__) + return _mm_srav_epi32(a, _mm_and_si128(b, _mm_set1_epi32(31))); +#elif defined(ARCH_ARM64) + return vshlq_s32(a, vnegq_s32(vandq_s32(b, gv_bcst8(31)))); +#else + v128 r; + for (u32 i = 0; i < 4; i++) + r._s32[i] = a._s32[i] >> (b._s32[i] & 31); + return r; +#endif +} + +// For each 8-bit element, r = rotate a by b +inline v128 gv_rol8(const v128& a, const v128& b) +{ +#if defined(ARCH_ARM64) + const auto amt1 = vandq_s8(b, gv_bcst8(7)); + const auto amt2 = vsubq_s8(amt1, gv_bcst8(8)); + return vorrq_u8(vshlq_u8(a, amt1), vshlq_u8(a, amt2)); +#else + const v128 x1 = gv_sub8(gv_add8(a, a), gv_gts8(gv_bcst8(0), a)); // rotate left by 1 + const v128 r1 = gv_signselect8(gv_shl64(b, 7), x1, a); + const v128 c2 = gv_bcst8(0x3); + const v128 x2 = gv_or32(gv_and32(gv_shr64(r1, 6), c2), gv_andn32(c2, gv_shl64(r1, 2))); // rotate by 2 + const v128 r2 = gv_signselect8(gv_shl64(b, 6), x2, r1); + const v128 c3 = gv_bcst8(0xf); + const v128 x3 = gv_or32(gv_and32(gv_shr64(r2, 4), c3), gv_andn32(c3, gv_shl64(r2, 4))); // 
rotate by 4 + return gv_signselect8(gv_shl64(b, 5), x3, r2); +#endif +} + +// For each 16-bit element, r = rotate a by b +inline v128 gv_rol16(const v128& a, const v128& b) +{ +#if defined(ARCH_ARM64) + const auto amt1 = vandq_s16(b, gv_bcst16(15)); + const auto amt2 = vsubq_s16(amt1, gv_bcst16(16)); + return vorrq_u16(vshlq_u16(a, amt1), vshlq_u16(a, amt2)); +#else + v128 r; + for (u32 i = 0; i < 8; i++) + r._u16[i] = utils::rol16(a._u16[i], b._u16[i]); + return r; +#endif +} + +// For each 32-bit element, r = rotate a by b +inline v128 gv_rol32(const v128& a, const v128& b) +{ +#if defined(__AVX512VL__) + return _mm_rolv_epi32(a, b); +#elif defined(ARCH_ARM64) + const auto amt1 = vandq_s32(b, gv_bcst32(31)); + const auto amt2 = vsubq_s32(amt1, gv_bcst32(32)); + return vorrq_u32(vshlq_u32(a, amt1), vshlq_u32(a, amt2)); +#else + v128 r; + for (u32 i = 0; i < 4; i++) + r._u32[i] = utils::rol32(a._u32[i], b._u32[i]); + return r; +#endif +} + +// For each 8-bit element, r = (a << (c & 7)) | (b >> (~c & 7) >> 1) +template +inline auto gv_fshl8(A&& a, B&& b, C&& c) +{ +#if defined(ARCH_ARM64) + const auto amt1 = vandq_s8(c, gv_bcst8(7)); + const auto amt2 = vsubq_s8(amt1, gv_bcst8(8)); + return v128(vorrq_u8(vshlq_u8(a, amt1), vshlq_u8(b, amt2))); +#else + auto x1 = gv_sub8(gv_add8(a, a), gv_gts8(gv_bcst8(0), b)); + auto s1 = gv_shl64(c, 7); + auto r1 = gv_signselect8(s1, std::move(x1), std::forward(a)); + auto b1 = gv_signselect8(std::move(s1), gv_shl64(b, 1), std::forward(b)); + auto c2 = gv_bcst8(0x3); + auto x2 = gv_and32(gv_shr64(b1, 6), c2); x2 = gv_or32(std::move(x2), gv_andn32(std::move(c2), gv_shl64(r1, 2))); + auto s2 = gv_shl64(c, 6); + auto r2 = gv_signselect8(s2, std::move(x2), std::move(r1)); + auto b2 = gv_signselect8(std::move(s2), gv_shl64(b1, 2), std::move(b1)); + auto c3 = gv_bcst8(0xf); + auto x3 = gv_and32(gv_shr64(std::move(b2), 4), c3); x3 = gv_or32(std::move(x3), gv_andn32(std::move(c3), gv_shl64(r2, 4))); + return gv_signselect8(gv_shl64(std::move(c), 5), std::move(x3), std::move(r2)); +#endif +} + +// For each 8-bit element, r = (b >> (c & 7)) | (a << (~c & 7) << 1) +template +inline auto gv_fshr8(A&& a, B&& b, C&& c) +{ +#if defined(ARCH_ARM64) + const auto amt1 = vandq_s8(c, gv_bcst8(7)); + const auto amt2 = vsubq_s8(gv_bcst8(8), amt1); + return vorrq_u8(vshlq_u8(b, vnegq_s8(amt1)), vshlq_u8(a, amt2)); +#else + auto c1 = gv_bcst8(0x7f); + auto x1 = gv_and32(gv_shr64(b, 1), c1); x1 = gv_or32(std::move(x1), gv_andn32(std::move(c1), gv_shl64(a, 7))); + auto s1 = gv_shl64(c, 7); + auto r1 = gv_signselect8(s1, std::move(x1), std::move(b)); + auto a1 = gv_signselect8(std::move(s1), gv_shr64(a, 1), std::move(a)); + auto c2 = gv_bcst8(0x3f); + auto x2 = gv_and32(gv_shr64(r1, 2), c2); x2 = gv_or32(std::move(x2), gv_andn32(std::move(c2), gv_shl64(a1, 6))); + auto s2 = gv_shl64(c, 6); + auto r2 = gv_signselect8(s2, std::move(x2), std::move(r1)); + auto a2 = gv_signselect8(std::move(s2), gv_shr64(a1, 2), std::move(a1)); + auto c3 = gv_bcst8(0x0f); + auto x3 = gv_and32(gv_shr64(r2, 4), c3); x3 = gv_or32(std::move(x3), gv_andn32(std::move(c3), gv_shl64(std::move(a2), 4))); + return gv_signselect8(gv_shl64(std::move(c), 5), std::move(x3), std::move(r2)); +#endif +} + +// Shift left by byte amount +template +inline v128 gv_shuffle_left(const v128& a) +{ + if (Count > 15) + return {}; +#if defined(ARCH_X64) + return _mm_slli_si128(a, Count); +#elif defined(ARCH_ARM64) + v128 idx; + for (u32 i = 0; i < 16; i++) + idx._u8[i] = u8(i - Count); + return vqtbl1q_u8(a, idx); +#endif +} + 
+template <u32 Count, typename A> requires (asmjit::any_operand_v<A>)
+inline auto gv_shuffle_left(A&& a)
+{
+	FOR_X64(unary_op, kIdPslldq, kIdVpslldq, std::forward<A>(a), Count);
+}
+
+// Shift right by byte amount
+template <u32 Count>
+inline v128 gv_shuffle_right(const v128& a)
+{
+	if (Count > 15)
+		return {};
+#if defined(ARCH_X64)
+	return _mm_srli_si128(a, Count);
+#elif defined(ARCH_ARM64)
+	v128 idx;
+	for (u32 i = 0; i < 16; i++)
+		idx._u8[i] = u8(i + Count);
+	return vqtbl1q_u8(a, idx);
+#endif
+}
+
+template <u32 Count, typename A> requires (asmjit::any_operand_v<A>)
+inline auto gv_shuffle_right(A&& a)
+{
+	FOR_X64(unary_op, kIdPsrldq, kIdVpsrldq, std::forward<A>(a), Count);
+}
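
Note on the utils::rol32/rol64 change (illustrative sketch, not part of the patch; rol32_safe is a hypothetical standalone name mirroring the fallback path): the old generic fallback "(x << n) | (x >> (32 - n))" hits undefined behaviour when n == 0, because "x >> 32" is out of range for a 32-bit operand. Masking both counts keeps every shift inside [0, 31] while producing the same rotation, and also makes counts >= 32 wrap instead of being UB.

#include <cstdint>
#include <cassert>

constexpr std::uint32_t rol32_safe(std::uint32_t x, std::uint32_t n)
{
    // (0 - n) & 31 equals (32 - n) & 31, but never yields a shift count of 32
    return (x << (n & 31)) | (x >> ((0 - n) & 31));
}

int main()
{
    static_assert(rol32_safe(0x80000001u, 0) == 0x80000001u, "n == 0 is now well-defined");
    static_assert(rol32_safe(0x80000001u, 1) == 0x00000003u, "plain rotate");
    static_assert(rol32_safe(0x12345678u, 36) == rol32_safe(0x12345678u, 4), "counts wrap mod 32");
    assert(rol32_safe(0xdeadbeefu, 13) == ((0xdeadbeefu << 13) | (0xdeadbeefu >> 19)));
}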
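
The new gv_shl8/gv_shr8/gv_rol8 fallbacks build a per-byte variable shift out of three gv_signselect8 blends. A scalar model of the idea (a sketch under the assumption that one byte behaves like one SIMD lane; shl8_cascade is a hypothetical name): each stage conditionally applies a shift of 1, 2 or 4 selected by one bit of the count, so three blends cover every count in [0, 7]. In the vector code the count is pre-shifted left by 7, 6 and 5 so the relevant bit lands in each byte's sign bit, which is what gv_signselect8 tests.

#include <cstdint>
#include <cassert>

std::uint8_t shl8_cascade(std::uint8_t a, std::uint8_t b)
{
    std::uint8_t r = a;
    if (b & 1) r = static_cast<std::uint8_t>(r << 1); // stage 1: selected by sign of (b << 7)
    if (b & 2) r = static_cast<std::uint8_t>(r << 2); // stage 2: selected by sign of (b << 6)
    if (b & 4) r = static_cast<std::uint8_t>(r << 4); // stage 3: selected by sign of (b << 5)
    return r;
}

int main()
{
    for (unsigned a = 0; a < 256; a++)
        for (unsigned b = 0; b < 8; b++)
            assert(shl8_cascade(std::uint8_t(a), std::uint8_t(b)) == std::uint8_t(a << b));
}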
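
The VSLDOI rework replaces a runtime byte loop with sixteen compile-time instantiations (one per possible shift immediate) plus a table lookup in the decoder. A minimal sketch of that dispatch pattern follows (assumptions: simplified types, a printf body, and a made-up field extraction standing in for op.vsh):

#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <utility>

using handler = void (*)(std::uint32_t op);

template <unsigned Count>
void sldoi_impl(std::uint32_t op)
{
    // A real handler would shift/concatenate vectors by Count bytes here.
    std::printf("VSLDOI with shift %u (opcode 0x%08x)\n", Count, op);
}

template <std::size_t... I>
constexpr std::array<handler, 16> make_table(std::index_sequence<I...>)
{
    return {&sldoi_impl<I>...};
}

int main()
{
    constexpr auto table = make_table(std::make_index_sequence<16>{});
    const std::uint32_t op = 0x12345678;
    table[(op >> 6) & 15](op); // hypothetical immediate extraction; the patch indexes by op.vsh
}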