mirror of
https://github.com/RPCS3/rpcs3.git
synced 2024-11-25 04:02:42 +01:00
BufferUtils: simd_builder refactoring
Some simplifications implemented.
This commit is contained in:
parent
a0d48c588a
commit
11a1f090d3
@ -357,7 +357,7 @@ asmjit::inline_runtime::~inline_runtime()
|
|||||||
asmjit::simd_builder::simd_builder(CodeHolder* ch) noexcept
|
asmjit::simd_builder::simd_builder(CodeHolder* ch) noexcept
|
||||||
: native_asm(ch)
|
: native_asm(ch)
|
||||||
{
|
{
|
||||||
_init(true);
|
_init(0);
|
||||||
consts[~v128()] = this->newLabel();
|
consts[~v128()] = this->newLabel();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -365,9 +365,9 @@ asmjit::simd_builder::~simd_builder()
|
|||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
void asmjit::simd_builder::_init(bool full)
|
void asmjit::simd_builder::_init(uint new_vsize)
|
||||||
{
|
{
|
||||||
if (full && utils::has_avx512_icl())
|
if ((!new_vsize && utils::has_avx512_icl()) || new_vsize == 64)
|
||||||
{
|
{
|
||||||
v0 = x86::zmm0;
|
v0 = x86::zmm0;
|
||||||
v1 = x86::zmm1;
|
v1 = x86::zmm1;
|
||||||
@ -377,7 +377,7 @@ void asmjit::simd_builder::_init(bool full)
|
|||||||
v5 = x86::zmm5;
|
v5 = x86::zmm5;
|
||||||
vsize = 64;
|
vsize = 64;
|
||||||
}
|
}
|
||||||
else if (full && utils::has_avx2())
|
else if ((!new_vsize && utils::has_avx2()) || new_vsize == 32)
|
||||||
{
|
{
|
||||||
v0 = x86::ymm0;
|
v0 = x86::ymm0;
|
||||||
v1 = x86::ymm1;
|
v1 = x86::ymm1;
|
||||||
@ -395,10 +395,10 @@ void asmjit::simd_builder::_init(bool full)
|
|||||||
v3 = x86::xmm3;
|
v3 = x86::xmm3;
|
||||||
v4 = x86::xmm4;
|
v4 = x86::xmm4;
|
||||||
v5 = x86::xmm5;
|
v5 = x86::xmm5;
|
||||||
vsize = 16;
|
vsize = new_vsize ? new_vsize : 16;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (full && utils::has_avx512())
|
if (!new_vsize && utils::has_avx512())
|
||||||
{
|
{
|
||||||
vmask = -1;
|
vmask = -1;
|
||||||
}
|
}
|
||||||
@ -480,6 +480,10 @@ void asmjit::simd_builder::vec_clobbering_test(u32 esize, const Operand& v, cons
|
|||||||
{
|
{
|
||||||
this->emit(x86::Inst::kIdVptest, v, rhs);
|
this->emit(x86::Inst::kIdVptest, v, rhs);
|
||||||
}
|
}
|
||||||
|
else if (esize == 16 && utils::has_avx())
|
||||||
|
{
|
||||||
|
this->emit(x86::Inst::kIdVptest, v, rhs);
|
||||||
|
}
|
||||||
else if (esize == 16 && utils::has_sse41())
|
else if (esize == 16 && utils::has_sse41())
|
||||||
{
|
{
|
||||||
this->emit(x86::Inst::kIdPtest, v, rhs);
|
this->emit(x86::Inst::kIdPtest, v, rhs);
|
||||||
@ -636,7 +640,7 @@ void asmjit::simd_builder::_vec_binary_op(x86::Inst::Id sse_op, x86::Inst::Id ve
|
|||||||
{
|
{
|
||||||
if (utils::has_avx())
|
if (utils::has_avx())
|
||||||
{
|
{
|
||||||
if (vex_op == x86::Inst::kIdNone || this->_extraReg.isReg())
|
if (evex_op != x86::Inst::kIdNone && (vex_op == x86::Inst::kIdNone || this->_extraReg.isReg() || vsize >= 64))
|
||||||
{
|
{
|
||||||
this->evex().emit(evex_op, dst, lhs, rhs);
|
this->evex().emit(evex_op, dst, lhs, rhs);
|
||||||
}
|
}
|
||||||
@ -694,92 +698,42 @@ void asmjit::simd_builder::vec_umax(u32 esize, const Operand& dst, const Operand
|
|||||||
fmt::throw_exception("Unimplemented");
|
fmt::throw_exception("Unimplemented");
|
||||||
}
|
}
|
||||||
|
|
||||||
void asmjit::simd_builder::vec_umin_horizontal_i128(u32 esize, const x86::Gp& dst, const Operand& src, const Operand& tmp)
|
void asmjit::simd_builder::vec_extract_high(u32, const Operand& dst, const Operand& src)
|
||||||
{
|
{
|
||||||
using enum x86::Inst::Id;
|
if (vsize == 32)
|
||||||
if (!utils::has_sse41())
|
this->vextracti32x8(x86::Ymm(dst.id()), x86::Zmm(src.id()), 1);
|
||||||
{
|
else if (vsize == 16)
|
||||||
fmt::throw_exception("Unimplemented");
|
this->vextracti128(x86::Xmm(dst.id()), x86::Ymm(src.id()), 1);
|
||||||
}
|
|
||||||
|
|
||||||
ensure(src != tmp);
|
|
||||||
|
|
||||||
if (esize == 2)
|
|
||||||
{
|
|
||||||
this->emit(utils::has_avx() ? kIdVphminposuw : kIdPhminposuw, x86::Xmm(tmp.id()), x86::Xmm(src.id()));
|
|
||||||
this->emit(utils::has_avx() ? kIdVpextrw : kIdPextrw, dst, x86::Xmm(tmp.id()), Imm(0));
|
|
||||||
}
|
|
||||||
else if (esize == 4)
|
|
||||||
{
|
|
||||||
if (utils::has_avx())
|
|
||||||
{
|
|
||||||
this->vpsrldq(x86::Xmm(tmp.id()), x86::Xmm(src.id()), 8);
|
|
||||||
this->vpminud(x86::Xmm(tmp.id()), x86::Xmm(tmp.id()), x86::Xmm(src.id()));
|
|
||||||
this->vpsrldq(x86::Xmm(src.id()), x86::Xmm(tmp.id()), 4);
|
|
||||||
this->vpminud(x86::Xmm(src.id()), x86::Xmm(src.id()), x86::Xmm(tmp.id()));
|
|
||||||
this->vmovd(dst.r32(), x86::Xmm(src.id()));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
this->movdqa(x86::Xmm(tmp.id()), x86::Xmm(src.id()));
|
|
||||||
this->psrldq(x86::Xmm(tmp.id()), 8);
|
|
||||||
this->pminud(x86::Xmm(tmp.id()), x86::Xmm(src.id()));
|
|
||||||
this->movdqa(x86::Xmm(src.id()), x86::Xmm(tmp.id()));
|
|
||||||
this->psrldq(x86::Xmm(src.id()), 4);
|
|
||||||
this->pminud(x86::Xmm(src.id()), x86::Xmm(tmp.id()));
|
|
||||||
this->movd(dst.r32(), x86::Xmm(src.id()));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
fmt::throw_exception("Unimplemented");
|
if (utils::has_avx())
|
||||||
|
this->vpsrldq(x86::Xmm(dst.id()), x86::Xmm(src.id()), vsize);
|
||||||
|
else
|
||||||
|
{
|
||||||
|
this->movdqa(x86::Xmm(dst.id()), x86::Xmm(src.id()));
|
||||||
|
this->psrldq(x86::Xmm(dst.id()), vsize);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void asmjit::simd_builder::vec_umax_horizontal_i128(u32 esize, const x86::Gp& dst, const Operand& src, const Operand& tmp)
|
void asmjit::simd_builder::vec_extract_gpr(u32 esize, const x86::Gp& dst, const Operand& src)
|
||||||
{
|
{
|
||||||
using enum x86::Inst::Id;
|
if (esize == 8 && utils::has_avx())
|
||||||
if (!utils::has_sse41())
|
this->vmovq(dst.r64(), x86::Xmm(src.id()));
|
||||||
{
|
else if (esize == 8)
|
||||||
fmt::throw_exception("Unimplemented");
|
this->movq(dst.r64(), x86::Xmm(src.id()));
|
||||||
}
|
else if (esize == 4 && utils::has_avx())
|
||||||
|
this->vmovd(dst.r32(), x86::Xmm(src.id()));
|
||||||
ensure(src != tmp);
|
|
||||||
|
|
||||||
if (esize == 2)
|
|
||||||
{
|
|
||||||
vec_set_all_ones(x86::Xmm(tmp.id()));
|
|
||||||
vec_xor(esize, x86::Xmm(tmp.id()), x86::Xmm(tmp.id()), x86::Xmm(src.id()));
|
|
||||||
this->emit(utils::has_avx() ? kIdVphminposuw : kIdPhminposuw, x86::Xmm(tmp.id()), x86::Xmm(tmp.id()));
|
|
||||||
this->emit(utils::has_avx() ? kIdVpextrw : kIdPextrw, dst, x86::Xmm(tmp.id()), Imm(0));
|
|
||||||
this->not_(dst.r16());
|
|
||||||
}
|
|
||||||
else if (esize == 4)
|
else if (esize == 4)
|
||||||
{
|
this->movd(dst.r32(), x86::Xmm(src.id()));
|
||||||
if (utils::has_avx())
|
else if (esize == 2 && utils::has_avx())
|
||||||
{
|
this->vpextrw(dst.r32(), x86::Xmm(src.id()), 0);
|
||||||
this->vpsrldq(x86::Xmm(tmp.id()), x86::Xmm(src.id()), 8);
|
else if (esize == 2)
|
||||||
this->vpmaxud(x86::Xmm(tmp.id()), x86::Xmm(tmp.id()), x86::Xmm(src.id()));
|
this->pextrw(dst.r32(), x86::Xmm(src.id()), 0);
|
||||||
this->vpsrldq(x86::Xmm(src.id()), x86::Xmm(tmp.id()), 4);
|
|
||||||
this->vpmaxud(x86::Xmm(src.id()), x86::Xmm(src.id()), x86::Xmm(tmp.id()));
|
|
||||||
this->vmovd(dst.r32(), x86::Xmm(src.id()));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
this->movdqa(x86::Xmm(tmp.id()), x86::Xmm(src.id()));
|
|
||||||
this->psrldq(x86::Xmm(tmp.id()), 8);
|
|
||||||
this->pmaxud(x86::Xmm(tmp.id()), x86::Xmm(src.id()));
|
|
||||||
this->movdqa(x86::Xmm(src.id()), x86::Xmm(tmp.id()));
|
|
||||||
this->psrldq(x86::Xmm(src.id()), 4);
|
|
||||||
this->pmaxud(x86::Xmm(src.id()), x86::Xmm(tmp.id()));
|
|
||||||
this->movd(dst.r32(), x86::Xmm(src.id()));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
else
|
||||||
{
|
|
||||||
fmt::throw_exception("Unimplemented");
|
fmt::throw_exception("Unimplemented");
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* X86 */
|
#endif /* X86 */
|
||||||
|
|
||||||
#ifdef LLVM_AVAILABLE
|
#ifdef LLVM_AVAILABLE
|
||||||
|
@ -226,7 +226,7 @@ namespace asmjit
|
|||||||
|
|
||||||
void operator()() noexcept;
|
void operator()() noexcept;
|
||||||
|
|
||||||
void _init(bool full);
|
void _init(uint new_vsize = 0);
|
||||||
void vec_cleanup_ret();
|
void vec_cleanup_ret();
|
||||||
void vec_set_all_zeros(const Operand& v);
|
void vec_set_all_zeros(const Operand& v);
|
||||||
void vec_set_all_ones(const Operand& v);
|
void vec_set_all_ones(const Operand& v);
|
||||||
@ -263,8 +263,8 @@ namespace asmjit
|
|||||||
void vec_umin(u32 esize, const Operand& dst, const Operand& lhs, const Operand& rhs);
|
void vec_umin(u32 esize, const Operand& dst, const Operand& lhs, const Operand& rhs);
|
||||||
void vec_umax(u32 esize, const Operand& dst, const Operand& lhs, const Operand& rhs);
|
void vec_umax(u32 esize, const Operand& dst, const Operand& lhs, const Operand& rhs);
|
||||||
|
|
||||||
void vec_umin_horizontal_i128(u32 esize, const x86::Gp& dst, const Operand& src, const Operand& tmp);
|
void vec_extract_high(u32 esize, const Operand& dst, const Operand& src);
|
||||||
void vec_umax_horizontal_i128(u32 esize, const x86::Gp& dst, const Operand& src, const Operand& tmp);
|
void vec_extract_gpr(u32 esize, const x86::Gp& dst, const Operand& src);
|
||||||
|
|
||||||
simd_builder& keep_if_not_masked()
|
simd_builder& keep_if_not_masked()
|
||||||
{
|
{
|
||||||
@ -287,7 +287,7 @@ namespace asmjit
|
|||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
|
|
||||||
void build_loop(u32 esize, auto reg_ctr, auto reg_cnt, auto&& build, auto&& reduce)
|
void build_loop(u32 esize, const x86::Gp& reg_ctr, const x86::Gp& reg_cnt, auto&& build, auto&& reduce)
|
||||||
{
|
{
|
||||||
ensure((esize & (esize - 1)) == 0);
|
ensure((esize & (esize - 1)) == 0);
|
||||||
ensure(esize <= vsize);
|
ensure(esize <= vsize);
|
||||||
@ -299,47 +299,76 @@ namespace asmjit
|
|||||||
const u32 step = vsize / esize;
|
const u32 step = vsize / esize;
|
||||||
|
|
||||||
this->xor_(reg_ctr.r32(), reg_ctr.r32()); // Reset counter reg
|
this->xor_(reg_ctr.r32(), reg_ctr.r32()); // Reset counter reg
|
||||||
this->sub(reg_cnt, step);
|
this->cmp(reg_cnt, step);
|
||||||
this->jb(next); // If count < step, skip main loop body
|
this->jb(next); // If count < step, skip main loop body
|
||||||
this->align(AlignMode::kCode, 16);
|
this->align(AlignMode::kCode, 16);
|
||||||
this->bind(body);
|
this->bind(body);
|
||||||
|
this->sub(reg_cnt, step);
|
||||||
build();
|
build();
|
||||||
this->add(reg_ctr, step);
|
this->add(reg_ctr, step);
|
||||||
this->sub(reg_cnt, step);
|
this->cmp(reg_cnt, step);
|
||||||
this->ja(body);
|
this->jae(body);
|
||||||
this->bind(next);
|
this->bind(next);
|
||||||
if (!vmask)
|
|
||||||
reduce();
|
|
||||||
this->add(reg_cnt, step);
|
|
||||||
this->jz(exit);
|
|
||||||
|
|
||||||
if (vmask)
|
if (vmask)
|
||||||
{
|
{
|
||||||
// Build single last iteration (masked)
|
// Build single last iteration (masked)
|
||||||
|
this->test(reg_cnt, reg_cnt);
|
||||||
|
this->jz(exit);
|
||||||
this->bzhi(reg_cnt, x86::Mem(consts[~u128()], 0), reg_cnt);
|
this->bzhi(reg_cnt, x86::Mem(consts[~u128()], 0), reg_cnt);
|
||||||
this->kmovq(x86::k7, reg_cnt);
|
this->kmovq(x86::k7, reg_cnt);
|
||||||
vmask = 7;
|
vmask = 7;
|
||||||
build();
|
build();
|
||||||
vmask = -1;
|
vmask = -1;
|
||||||
reduce();
|
|
||||||
|
// Rollout reduction step
|
||||||
|
this->bind(exit);
|
||||||
|
while (true)
|
||||||
|
{
|
||||||
|
vsize /= 2;
|
||||||
|
if (vsize < esize)
|
||||||
|
break;
|
||||||
|
this->_init(vsize);
|
||||||
|
reduce();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// Build tail loop (reduced vector width)
|
// Build unrolled loop tail (reduced vector width)
|
||||||
Label body = this->newLabel();
|
while (true)
|
||||||
this->align(AlignMode::kCode, 16);
|
{
|
||||||
this->bind(body);
|
vsize /= 2;
|
||||||
const uint vsz = vsize / step;
|
if (vsize < esize)
|
||||||
this->_init(false);
|
break;
|
||||||
vsize = vsz;
|
|
||||||
build();
|
// Shall not clobber flags
|
||||||
this->_init(true);
|
this->_init(vsize);
|
||||||
this->inc(reg_ctr);
|
reduce();
|
||||||
this->sub(reg_cnt, 1);
|
|
||||||
this->ja(body);
|
if (vsize == esize)
|
||||||
|
{
|
||||||
|
// Last "iteration"
|
||||||
|
this->test(reg_cnt, reg_cnt);
|
||||||
|
this->jz(exit);
|
||||||
|
build();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
const u32 step = vsize / esize;
|
||||||
|
Label next = this->newLabel();
|
||||||
|
this->cmp(reg_cnt, step);
|
||||||
|
this->jb(next);
|
||||||
|
build();
|
||||||
|
this->add(reg_ctr, step);
|
||||||
|
this->sub(reg_cnt, step);
|
||||||
|
this->bind(next);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
this->bind(exit);
|
||||||
}
|
}
|
||||||
|
|
||||||
this->bind(exit);
|
this->_init(0);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -193,7 +193,7 @@ namespace
|
|||||||
{
|
{
|
||||||
if constexpr (Compare)
|
if constexpr (Compare)
|
||||||
{
|
{
|
||||||
if (c.vsize == 32 && c.vmask == 0)
|
if (c.vsize == 16 && c.vmask == 0)
|
||||||
{
|
{
|
||||||
// Fix for AVX2 path
|
// Fix for AVX2 path
|
||||||
c.vextracti128(x86::xmm0, x86::ymm2, 1);
|
c.vextracti128(x86::xmm0, x86::ymm2, 1);
|
||||||
@ -280,12 +280,9 @@ namespace
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
static const v128 all_ones_except_low_element = gv_shuffle_left<sizeof(T)>(v128::from32p(-1));
|
|
||||||
|
|
||||||
c.vec_set_const(c.v1, sizeof(T) == 2 ? s_bswap_u16_mask : s_bswap_u32_mask);
|
c.vec_set_const(c.v1, sizeof(T) == 2 ? s_bswap_u16_mask : s_bswap_u32_mask);
|
||||||
c.vec_set_all_ones(c.v2); // vec min
|
c.vec_set_all_ones(c.v2); // vec min
|
||||||
c.vec_set_all_zeros(c.v3); // vec max
|
c.vec_set_all_zeros(c.v3); // vec max
|
||||||
c.vec_set_const(c.v4, all_ones_except_low_element);
|
|
||||||
|
|
||||||
c.build_loop(sizeof(T), x86::eax, args[2].r32(), [&]
|
c.build_loop(sizeof(T), x86::eax, args[2].r32(), [&]
|
||||||
{
|
{
|
||||||
@ -310,40 +307,19 @@ namespace
|
|||||||
}
|
}
|
||||||
|
|
||||||
c.keep_if_not_masked().vec_umax(sizeof(T), c.v3, c.v3, c.v0);
|
c.keep_if_not_masked().vec_umax(sizeof(T), c.v3, c.v3, c.v0);
|
||||||
|
c.keep_if_not_masked().vec_umin(sizeof(T), c.v2, c.v2, c.v0);
|
||||||
if (c.vsize < 16)
|
|
||||||
{
|
|
||||||
// In remaining loop: protect min values
|
|
||||||
c.vec_or(sizeof(T), c.v5, c.v0, c.v4);
|
|
||||||
c.vec_umin(sizeof(T), c.v2, c.v2, c.v5);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
c.keep_if_not_masked().vec_umin(sizeof(T), c.v2, c.v2, c.v0);
|
|
||||||
}
|
|
||||||
|
|
||||||
c.keep_if_not_masked().vec_store_unaligned(sizeof(T), c.v0, c.ptr_scale_for_vec(sizeof(T), args[1], x86::rax));
|
c.keep_if_not_masked().vec_store_unaligned(sizeof(T), c.v0, c.ptr_scale_for_vec(sizeof(T), args[1], x86::rax));
|
||||||
}, [&]
|
}, [&]
|
||||||
{
|
{
|
||||||
// Compress to xmm, protect high values
|
// Compress horizontally, protect high values
|
||||||
if (c.vsize >= 64)
|
c.vec_extract_high(sizeof(T), c.v0, c.v3);
|
||||||
{
|
c.vec_umax(sizeof(T), c.v3, c.v3, c.v0);
|
||||||
c.vextracti32x8(x86::ymm0, x86::zmm3, 1);
|
c.vec_extract_high(sizeof(T), c.v0, c.v2);
|
||||||
c.emit(sizeof(T) == 4 ? x86::Inst::kIdVpmaxud : x86::Inst::kIdVpmaxuw, x86::ymm3, x86::ymm3, x86::ymm0);
|
c.vec_umin(sizeof(T), c.v2, c.v2, c.v0);
|
||||||
c.vextracti32x8(x86::ymm0, x86::zmm2, 1);
|
|
||||||
c.emit(sizeof(T) == 4 ? x86::Inst::kIdVpminud : x86::Inst::kIdVpminuw, x86::ymm2, x86::ymm2, x86::ymm0);
|
|
||||||
}
|
|
||||||
if (c.vsize >= 32)
|
|
||||||
{
|
|
||||||
c.vextracti128(x86::xmm0, x86::ymm3, 1);
|
|
||||||
c.emit(sizeof(T) == 4 ? x86::Inst::kIdVpmaxud : x86::Inst::kIdVpmaxuw, x86::xmm3, x86::xmm3, x86::xmm0);
|
|
||||||
c.vextracti128(x86::xmm0, x86::ymm2, 1);
|
|
||||||
c.emit(sizeof(T) == 4 ? x86::Inst::kIdVpminud : x86::Inst::kIdVpminuw, x86::xmm2, x86::xmm2, x86::xmm0);
|
|
||||||
}
|
|
||||||
});
|
});
|
||||||
|
|
||||||
c.vec_umax_horizontal_i128(sizeof(T), x86::rdx, c.v3, c.v0);
|
c.vec_extract_gpr(sizeof(T), x86::edx, c.v3);
|
||||||
c.vec_umin_horizontal_i128(sizeof(T), x86::rax, c.v2, c.v0);
|
c.vec_extract_gpr(sizeof(T), x86::eax, c.v2);
|
||||||
c.shl(x86::rdx, 32);
|
c.shl(x86::rdx, 32);
|
||||||
c.or_(x86::rax, x86::rdx);
|
c.or_(x86::rax, x86::rdx);
|
||||||
c.vec_cleanup_ret();
|
c.vec_cleanup_ret();
|
||||||
|
Loading…
Reference in New Issue
Block a user