From 11a1f090d3927d2450b8265318efa345b6d2b1c8 Mon Sep 17 00:00:00 2001 From: Nekotekina Date: Sat, 3 Sep 2022 15:51:37 +0300 Subject: [PATCH] BufferUtils: simd_builder refactoring Some simplifications implemented. --- Utilities/JIT.cpp | 118 ++++++++------------------- Utilities/JIT.h | 79 ++++++++++++------ rpcs3/Emu/RSX/Common/BufferUtils.cpp | 42 ++-------- 3 files changed, 99 insertions(+), 140 deletions(-) diff --git a/Utilities/JIT.cpp b/Utilities/JIT.cpp index 7f59925f75..bbbb924302 100644 --- a/Utilities/JIT.cpp +++ b/Utilities/JIT.cpp @@ -357,7 +357,7 @@ asmjit::inline_runtime::~inline_runtime() asmjit::simd_builder::simd_builder(CodeHolder* ch) noexcept : native_asm(ch) { - _init(true); + _init(0); consts[~v128()] = this->newLabel(); } @@ -365,9 +365,9 @@ asmjit::simd_builder::~simd_builder() { } -void asmjit::simd_builder::_init(bool full) +void asmjit::simd_builder::_init(uint new_vsize) { - if (full && utils::has_avx512_icl()) + if ((!new_vsize && utils::has_avx512_icl()) || new_vsize == 64) { v0 = x86::zmm0; v1 = x86::zmm1; @@ -377,7 +377,7 @@ void asmjit::simd_builder::_init(bool full) v5 = x86::zmm5; vsize = 64; } - else if (full && utils::has_avx2()) + else if ((!new_vsize && utils::has_avx2()) || new_vsize == 32) { v0 = x86::ymm0; v1 = x86::ymm1; @@ -395,10 +395,10 @@ void asmjit::simd_builder::_init(bool full) v3 = x86::xmm3; v4 = x86::xmm4; v5 = x86::xmm5; - vsize = 16; + vsize = new_vsize ? new_vsize : 16; } - if (full && utils::has_avx512()) + if (!new_vsize && utils::has_avx512()) { vmask = -1; } @@ -480,6 +480,10 @@ void asmjit::simd_builder::vec_clobbering_test(u32 esize, const Operand& v, cons { this->emit(x86::Inst::kIdVptest, v, rhs); } + else if (esize == 16 && utils::has_avx()) + { + this->emit(x86::Inst::kIdVptest, v, rhs); + } else if (esize == 16 && utils::has_sse41()) { this->emit(x86::Inst::kIdPtest, v, rhs); @@ -636,7 +640,7 @@ void asmjit::simd_builder::_vec_binary_op(x86::Inst::Id sse_op, x86::Inst::Id ve { if (utils::has_avx()) { - if (vex_op == x86::Inst::kIdNone || this->_extraReg.isReg()) + if (evex_op != x86::Inst::kIdNone && (vex_op == x86::Inst::kIdNone || this->_extraReg.isReg() || vsize >= 64)) { this->evex().emit(evex_op, dst, lhs, rhs); } @@ -694,92 +698,42 @@ void asmjit::simd_builder::vec_umax(u32 esize, const Operand& dst, const Operand fmt::throw_exception("Unimplemented"); } -void asmjit::simd_builder::vec_umin_horizontal_i128(u32 esize, const x86::Gp& dst, const Operand& src, const Operand& tmp) +void asmjit::simd_builder::vec_extract_high(u32, const Operand& dst, const Operand& src) { - using enum x86::Inst::Id; - if (!utils::has_sse41()) - { - fmt::throw_exception("Unimplemented"); - } - - ensure(src != tmp); - - if (esize == 2) - { - this->emit(utils::has_avx() ? kIdVphminposuw : kIdPhminposuw, x86::Xmm(tmp.id()), x86::Xmm(src.id())); - this->emit(utils::has_avx() ? kIdVpextrw : kIdPextrw, dst, x86::Xmm(tmp.id()), Imm(0)); - } - else if (esize == 4) - { - if (utils::has_avx()) - { - this->vpsrldq(x86::Xmm(tmp.id()), x86::Xmm(src.id()), 8); - this->vpminud(x86::Xmm(tmp.id()), x86::Xmm(tmp.id()), x86::Xmm(src.id())); - this->vpsrldq(x86::Xmm(src.id()), x86::Xmm(tmp.id()), 4); - this->vpminud(x86::Xmm(src.id()), x86::Xmm(src.id()), x86::Xmm(tmp.id())); - this->vmovd(dst.r32(), x86::Xmm(src.id())); - } - else - { - this->movdqa(x86::Xmm(tmp.id()), x86::Xmm(src.id())); - this->psrldq(x86::Xmm(tmp.id()), 8); - this->pminud(x86::Xmm(tmp.id()), x86::Xmm(src.id())); - this->movdqa(x86::Xmm(src.id()), x86::Xmm(tmp.id())); - this->psrldq(x86::Xmm(src.id()), 4); - this->pminud(x86::Xmm(src.id()), x86::Xmm(tmp.id())); - this->movd(dst.r32(), x86::Xmm(src.id())); - } - } + if (vsize == 32) + this->vextracti32x8(x86::Ymm(dst.id()), x86::Zmm(src.id()), 1); + else if (vsize == 16) + this->vextracti128(x86::Xmm(dst.id()), x86::Ymm(src.id()), 1); else { - fmt::throw_exception("Unimplemented"); + if (utils::has_avx()) + this->vpsrldq(x86::Xmm(dst.id()), x86::Xmm(src.id()), vsize); + else + { + this->movdqa(x86::Xmm(dst.id()), x86::Xmm(src.id())); + this->psrldq(x86::Xmm(dst.id()), vsize); + } } } -void asmjit::simd_builder::vec_umax_horizontal_i128(u32 esize, const x86::Gp& dst, const Operand& src, const Operand& tmp) +void asmjit::simd_builder::vec_extract_gpr(u32 esize, const x86::Gp& dst, const Operand& src) { - using enum x86::Inst::Id; - if (!utils::has_sse41()) - { - fmt::throw_exception("Unimplemented"); - } - - ensure(src != tmp); - - if (esize == 2) - { - vec_set_all_ones(x86::Xmm(tmp.id())); - vec_xor(esize, x86::Xmm(tmp.id()), x86::Xmm(tmp.id()), x86::Xmm(src.id())); - this->emit(utils::has_avx() ? kIdVphminposuw : kIdPhminposuw, x86::Xmm(tmp.id()), x86::Xmm(tmp.id())); - this->emit(utils::has_avx() ? kIdVpextrw : kIdPextrw, dst, x86::Xmm(tmp.id()), Imm(0)); - this->not_(dst.r16()); - } + if (esize == 8 && utils::has_avx()) + this->vmovq(dst.r64(), x86::Xmm(src.id())); + else if (esize == 8) + this->movq(dst.r64(), x86::Xmm(src.id())); + else if (esize == 4 && utils::has_avx()) + this->vmovd(dst.r32(), x86::Xmm(src.id())); else if (esize == 4) - { - if (utils::has_avx()) - { - this->vpsrldq(x86::Xmm(tmp.id()), x86::Xmm(src.id()), 8); - this->vpmaxud(x86::Xmm(tmp.id()), x86::Xmm(tmp.id()), x86::Xmm(src.id())); - this->vpsrldq(x86::Xmm(src.id()), x86::Xmm(tmp.id()), 4); - this->vpmaxud(x86::Xmm(src.id()), x86::Xmm(src.id()), x86::Xmm(tmp.id())); - this->vmovd(dst.r32(), x86::Xmm(src.id())); - } - else - { - this->movdqa(x86::Xmm(tmp.id()), x86::Xmm(src.id())); - this->psrldq(x86::Xmm(tmp.id()), 8); - this->pmaxud(x86::Xmm(tmp.id()), x86::Xmm(src.id())); - this->movdqa(x86::Xmm(src.id()), x86::Xmm(tmp.id())); - this->psrldq(x86::Xmm(src.id()), 4); - this->pmaxud(x86::Xmm(src.id()), x86::Xmm(tmp.id())); - this->movd(dst.r32(), x86::Xmm(src.id())); - } - } + this->movd(dst.r32(), x86::Xmm(src.id())); + else if (esize == 2 && utils::has_avx()) + this->vpextrw(dst.r32(), x86::Xmm(src.id()), 0); + else if (esize == 2) + this->pextrw(dst.r32(), x86::Xmm(src.id()), 0); else - { fmt::throw_exception("Unimplemented"); - } } + #endif /* X86 */ #ifdef LLVM_AVAILABLE diff --git a/Utilities/JIT.h b/Utilities/JIT.h index 0f616ab6ab..c5bb0b2ed4 100644 --- a/Utilities/JIT.h +++ b/Utilities/JIT.h @@ -226,7 +226,7 @@ namespace asmjit void operator()() noexcept; - void _init(bool full); + void _init(uint new_vsize = 0); void vec_cleanup_ret(); void vec_set_all_zeros(const Operand& v); void vec_set_all_ones(const Operand& v); @@ -263,8 +263,8 @@ namespace asmjit void vec_umin(u32 esize, const Operand& dst, const Operand& lhs, const Operand& rhs); void vec_umax(u32 esize, const Operand& dst, const Operand& lhs, const Operand& rhs); - void vec_umin_horizontal_i128(u32 esize, const x86::Gp& dst, const Operand& src, const Operand& tmp); - void vec_umax_horizontal_i128(u32 esize, const x86::Gp& dst, const Operand& src, const Operand& tmp); + void vec_extract_high(u32 esize, const Operand& dst, const Operand& src); + void vec_extract_gpr(u32 esize, const x86::Gp& dst, const Operand& src); simd_builder& keep_if_not_masked() { @@ -287,7 +287,7 @@ namespace asmjit return *this; } - void build_loop(u32 esize, auto reg_ctr, auto reg_cnt, auto&& build, auto&& reduce) + void build_loop(u32 esize, const x86::Gp& reg_ctr, const x86::Gp& reg_cnt, auto&& build, auto&& reduce) { ensure((esize & (esize - 1)) == 0); ensure(esize <= vsize); @@ -299,47 +299,76 @@ namespace asmjit const u32 step = vsize / esize; this->xor_(reg_ctr.r32(), reg_ctr.r32()); // Reset counter reg - this->sub(reg_cnt, step); + this->cmp(reg_cnt, step); this->jb(next); // If count < step, skip main loop body this->align(AlignMode::kCode, 16); this->bind(body); + this->sub(reg_cnt, step); build(); this->add(reg_ctr, step); - this->sub(reg_cnt, step); - this->ja(body); + this->cmp(reg_cnt, step); + this->jae(body); this->bind(next); - if (!vmask) - reduce(); - this->add(reg_cnt, step); - this->jz(exit); if (vmask) { // Build single last iteration (masked) + this->test(reg_cnt, reg_cnt); + this->jz(exit); this->bzhi(reg_cnt, x86::Mem(consts[~u128()], 0), reg_cnt); this->kmovq(x86::k7, reg_cnt); vmask = 7; build(); vmask = -1; - reduce(); + + // Rollout reduction step + this->bind(exit); + while (true) + { + vsize /= 2; + if (vsize < esize) + break; + this->_init(vsize); + reduce(); + } } else { - // Build tail loop (reduced vector width) - Label body = this->newLabel(); - this->align(AlignMode::kCode, 16); - this->bind(body); - const uint vsz = vsize / step; - this->_init(false); - vsize = vsz; - build(); - this->_init(true); - this->inc(reg_ctr); - this->sub(reg_cnt, 1); - this->ja(body); + // Build unrolled loop tail (reduced vector width) + while (true) + { + vsize /= 2; + if (vsize < esize) + break; + + // Shall not clobber flags + this->_init(vsize); + reduce(); + + if (vsize == esize) + { + // Last "iteration" + this->test(reg_cnt, reg_cnt); + this->jz(exit); + build(); + } + else + { + const u32 step = vsize / esize; + Label next = this->newLabel(); + this->cmp(reg_cnt, step); + this->jb(next); + build(); + this->add(reg_ctr, step); + this->sub(reg_cnt, step); + this->bind(next); + } + } + + this->bind(exit); } - this->bind(exit); + this->_init(0); } }; diff --git a/rpcs3/Emu/RSX/Common/BufferUtils.cpp b/rpcs3/Emu/RSX/Common/BufferUtils.cpp index 394a51637e..199976951f 100644 --- a/rpcs3/Emu/RSX/Common/BufferUtils.cpp +++ b/rpcs3/Emu/RSX/Common/BufferUtils.cpp @@ -193,7 +193,7 @@ namespace { if constexpr (Compare) { - if (c.vsize == 32 && c.vmask == 0) + if (c.vsize == 16 && c.vmask == 0) { // Fix for AVX2 path c.vextracti128(x86::xmm0, x86::ymm2, 1); @@ -280,12 +280,9 @@ namespace return; } - static const v128 all_ones_except_low_element = gv_shuffle_left(v128::from32p(-1)); - c.vec_set_const(c.v1, sizeof(T) == 2 ? s_bswap_u16_mask : s_bswap_u32_mask); c.vec_set_all_ones(c.v2); // vec min c.vec_set_all_zeros(c.v3); // vec max - c.vec_set_const(c.v4, all_ones_except_low_element); c.build_loop(sizeof(T), x86::eax, args[2].r32(), [&] { @@ -310,40 +307,19 @@ namespace } c.keep_if_not_masked().vec_umax(sizeof(T), c.v3, c.v3, c.v0); - - if (c.vsize < 16) - { - // In remaining loop: protect min values - c.vec_or(sizeof(T), c.v5, c.v0, c.v4); - c.vec_umin(sizeof(T), c.v2, c.v2, c.v5); - } - else - { - c.keep_if_not_masked().vec_umin(sizeof(T), c.v2, c.v2, c.v0); - } - + c.keep_if_not_masked().vec_umin(sizeof(T), c.v2, c.v2, c.v0); c.keep_if_not_masked().vec_store_unaligned(sizeof(T), c.v0, c.ptr_scale_for_vec(sizeof(T), args[1], x86::rax)); }, [&] { - // Compress to xmm, protect high values - if (c.vsize >= 64) - { - c.vextracti32x8(x86::ymm0, x86::zmm3, 1); - c.emit(sizeof(T) == 4 ? x86::Inst::kIdVpmaxud : x86::Inst::kIdVpmaxuw, x86::ymm3, x86::ymm3, x86::ymm0); - c.vextracti32x8(x86::ymm0, x86::zmm2, 1); - c.emit(sizeof(T) == 4 ? x86::Inst::kIdVpminud : x86::Inst::kIdVpminuw, x86::ymm2, x86::ymm2, x86::ymm0); - } - if (c.vsize >= 32) - { - c.vextracti128(x86::xmm0, x86::ymm3, 1); - c.emit(sizeof(T) == 4 ? x86::Inst::kIdVpmaxud : x86::Inst::kIdVpmaxuw, x86::xmm3, x86::xmm3, x86::xmm0); - c.vextracti128(x86::xmm0, x86::ymm2, 1); - c.emit(sizeof(T) == 4 ? x86::Inst::kIdVpminud : x86::Inst::kIdVpminuw, x86::xmm2, x86::xmm2, x86::xmm0); - } + // Compress horizontally, protect high values + c.vec_extract_high(sizeof(T), c.v0, c.v3); + c.vec_umax(sizeof(T), c.v3, c.v3, c.v0); + c.vec_extract_high(sizeof(T), c.v0, c.v2); + c.vec_umin(sizeof(T), c.v2, c.v2, c.v0); }); - c.vec_umax_horizontal_i128(sizeof(T), x86::rdx, c.v3, c.v0); - c.vec_umin_horizontal_i128(sizeof(T), x86::rax, c.v2, c.v0); + c.vec_extract_gpr(sizeof(T), x86::edx, c.v3); + c.vec_extract_gpr(sizeof(T), x86::eax, c.v2); c.shl(x86::rdx, 32); c.or_(x86::rax, x86::rdx); c.vec_cleanup_ret();