
Implement independent CPU preemptions

This commit is contained in:
Eladash 2022-09-06 18:59:23 +03:00 committed by kd-11
parent b9e20dc5c9
commit ec7b18dab5
12 changed files with 358 additions and 4 deletions

View File

@@ -55,6 +55,8 @@ void fmt_class_string<cpu_flag>::format(std::string& out, u64 arg)
case cpu_flag::memory: return "mem";
case cpu_flag::pending: return "pend";
case cpu_flag::pending_recheck: return "pend-re";
case cpu_flag::yield: return "y";
case cpu_flag::preempt: return "PREEMPT";
case cpu_flag::dbg_global_pause: return "G-PAUSE";
case cpu_flag::dbg_pause: return "PAUSE";
case cpu_flag::dbg_step: return "STEP";
@@ -575,6 +577,7 @@ void cpu_thread::operator()()
if (!(state0 & cpu_flag::stop))
{
cpu_task();
state += cpu_flag::wait;
if (state & cpu_flag::ret && state.test_and_reset(cpu_flag::ret))
{
@@ -731,7 +734,7 @@ bool cpu_thread::check_state() noexcept
if (!is_stopped(flags) && flags.none_of(cpu_flag::ret))
{
// Check pause flags which hold thread inside check_state (ignore suspend/debug flags on cpu_flag::temp)
if (flags & (cpu_flag::pause + cpu_flag::memory) || (cpu_can_stop && flags & (cpu_flag::dbg_global_pause + cpu_flag::dbg_pause + cpu_flag::suspend)))
if (flags & (cpu_flag::pause + cpu_flag::memory + cpu_flag::yield + cpu_flag::preempt) || (cpu_can_stop && flags & (cpu_flag::dbg_global_pause + cpu_flag::dbg_pause + cpu_flag::suspend)))
{
if (!(flags & cpu_flag::wait))
{
@@ -739,6 +742,12 @@ bool cpu_thread::check_state() noexcept
store = true;
}
if (flags & (cpu_flag::yield + cpu_flag::preempt))
{
flags -= (cpu_flag::yield + cpu_flag::preempt);
store = true;
}
escape = false;
state1 = flags;
return store;
@@ -768,6 +777,30 @@ bool cpu_thread::check_state() noexcept
return store;
}).first;
if (state0 & cpu_flag::preempt)
{
if (cpu_flag::wait - state0)
{
// Yield itself
state.wait(state1, atomic_wait_timeout{20'000});
}
if (const u128 bits = s_cpu_bits)
{
reader_lock lock(s_cpu_lock);
cpu_counter::for_all_cpu(bits & s_cpu_bits, [](cpu_thread* cpu)
{
if (cpu->state.none_of(cpu_flag::wait + cpu_flag::yield))
{
cpu->state += cpu_flag::yield;
}
return true;
});
}
}
if (escape)
{
if (s_tls_thread_slot == umax && !retval)
@@ -856,6 +889,14 @@ bool cpu_thread::check_state() noexcept
break;
}
}
continue;
}
if (state0 & cpu_flag::yield && cpu_flag::wait - state0)
{
// Short sleep when the yield flag is present alone (it is pointless when other mechanisms that can stop thread execution have already run)
state.wait(state1, atomic_wait_timeout{20'000});
}
}
}
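
For orientation, a minimal self-contained sketch of the fan-out the new check_state() branch performs: the thread that observes cpu_flag::preempt clears its own request flags, then raises a yield request on every other registered CPU thread that is not already waiting. The real code uses bs_t<cpu_flag>, s_cpu_bits, s_cpu_lock and cpu_counter::for_all_cpu; fake_cpu, fan_out_yield, g_cpus and g_cpu_bits below are names invented for this sketch, and plain std::atomic flag words stand in for the engine's atomic bitsets.

#include <array>
#include <atomic>
#include <cstddef>
#include <cstdint>

enum flag_bits : uint32_t { f_wait = 1u << 0, f_yield = 1u << 1, f_preempt = 1u << 2 };

struct fake_cpu { std::atomic<uint32_t> state{0}; };

std::array<fake_cpu, 8> g_cpus;          // stand-in for the cpu_counter slots
std::atomic<uint64_t> g_cpu_bits{0xff};  // one bit per registered slot (s_cpu_bits in the real code)

void fan_out_yield(fake_cpu& self)
{
    self.state.fetch_and(~uint32_t(f_yield | f_preempt)); // consume the request on the current thread

    const uint64_t bits = g_cpu_bits.load();

    for (std::size_t i = 0; i < g_cpus.size(); i++)
    {
        if (!(bits & (uint64_t{1} << i)) || &g_cpus[i] == &self)
            continue;

        // Skip threads that are already waiting or were already asked to yield,
        // mirroring the none_of(cpu_flag::wait + cpu_flag::yield) check above.
        if (!(g_cpus[i].state.load() & (f_wait | f_yield)))
            g_cpus[i].state.fetch_or(f_yield);
    }
}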

View File

@@ -25,6 +25,8 @@ enum class cpu_flag : u32
pending, // Thread has postponed work
pending_recheck, // Thread needs to recheck if there is pending work before ::pending removal
notify, // Flag meant solely to allow atomic notification on state without changing other flags
yield, // Thread is being requested to yield its execution time if it's running
preempt, // Thread is being requested to preempt the execution of all CPU threads
dbg_global_pause, // Emulation paused
dbg_pause, // Thread paused
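
The expressions in this commit rely on RPCS3's flag-set arithmetic: '+' builds a set of flags, '-' removes flags, and a set converts to bool when non-empty, so cpu_flag::wait - state0 reads as "wait is not set in state0" while state += cpu_flag::wait raises a flag. Below is a toy, non-atomic model of that arithmetic for illustration only; flag and flag_set are names invented here, and the real implementation is RPCS3's bs_t bitset template.

#include <cstdint>

enum class flag : uint32_t { wait, yield, preempt };

struct flag_set
{
    uint32_t bits = 0;
    constexpr flag_set() = default;
    constexpr flag_set(flag f) : bits(1u << static_cast<uint32_t>(f)) {}
    constexpr explicit operator bool() const { return bits != 0; }
};

constexpr flag_set operator+(flag_set a, flag_set b) { flag_set r; r.bits = a.bits | b.bits; return r; }
constexpr flag_set operator-(flag_set a, flag_set b) { flag_set r; r.bits = a.bits & ~b.bits; return r; }
constexpr flag_set operator&(flag_set a, flag_set b) { flag_set r; r.bits = a.bits & b.bits; return r; }

// flag::wait - state is truthy exactly when wait is absent from state:
static_assert(static_cast<bool>(flag::wait - (flag_set(flag::yield) + flag::preempt)));
static_assert(!static_cast<bool>(flag::wait - flag_set(flag::wait)));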

View File

@@ -4247,7 +4247,7 @@ s64 spu_thread::get_ch_value(u32 ch)
spu_function_logger logger(*this, "MFC Events read");
state += cpu_flag::wait;
lv2_obj::prepare_for_sleep(*this);
using resrv_ptr = std::add_pointer_t<const decltype(rdata)>;
@@ -4943,7 +4943,7 @@ bool spu_thread::stop_and_signal(u32 code)
return ch_in_mbox.set_values(1, CELL_EINVAL), true;
}
state += cpu_flag::wait;
lv2_obj::prepare_for_sleep(*this);
spu_function_logger logger(*this, "sys_spu_thread_receive_event");

View File

@@ -51,9 +51,15 @@
#include <optional>
#include <deque>
#include "util/tsc.hpp"
extern std::string ppu_get_syscall_name(u64 code);
namespace rsx
{
void set_rsx_yield_flag() noexcept;
}
template <>
void fmt_class_string<ppu_syscall_code>::format(std::string& out, u64 arg)
{
@@ -1202,6 +1208,10 @@ static std::deque<std::pair<u64, class cpu_thread*>> g_waiting;
// Threads which must call lv2_obj::sleep before the scheduler starts
static std::deque<class cpu_thread*> g_to_sleep;
static atomic_t<u64> s_yield_frequency = 0;
static atomic_t<u64> s_max_allowed_yield_tsc = 0;
static u64 s_last_yield_tsc = 0;
namespace cpu_counter
{
void remove(cpu_thread*) noexcept;
@@ -1577,6 +1587,7 @@ void lv2_obj::cleanup()
g_to_sleep.clear();
g_waiting.clear();
g_pending = 0;
s_yield_frequency = 0;
}
void lv2_obj::schedule_all(u64 current_time)
@@ -1653,6 +1664,22 @@ void lv2_obj::schedule_all(u64 current_time)
// Null-terminate the list if it ends before last slot
g_to_notify[notify_later_idx] = nullptr;
}
if (const u64 freq = s_yield_frequency)
{
if (auto cpu = cpu_thread::get_current())
{
const u64 tsc = utils::get_tsc();
const u64 last_tsc = s_last_yield_tsc;
if (tsc >= last_tsc && tsc <= s_max_allowed_yield_tsc && tsc - last_tsc >= freq)
{
cpu->state += cpu_flag::preempt;
s_last_yield_tsc = tsc;
rsx::set_rsx_yield_flag();
}
}
}
}
ppu_thread_status lv2_obj::ppu_state(ppu_thread* ppu, bool lock_idm, bool lock_lv2)
@@ -1737,6 +1764,12 @@ bool lv2_obj::has_ppus_in_running_state()
return false;
}
void lv2_obj::set_yield_frequency(u64 freq, u64 max_allowed_tsc)
{
s_yield_frequency.release(freq);
s_max_allowed_yield_tsc.release(max_allowed_tsc);
}
bool lv2_obj::wait_timeout(u64 usec, ppu_thread* cpu, bool scale, bool is_usleep)
{
static_assert(u64{umax} / max_timeout >= 100, "max timeout is not valid for scaling");
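
The consumer side of the throttle sits in schedule_all() above: a preemption is requested at most once per s_yield_frequency TSC ticks, never before the previous mark and never past s_max_allowed_yield_tsc. A minimal stand-alone model of that condition follows; yield_throttle and should_preempt are names invented for this sketch, not RPCS3 API.

#include <cstdint>

struct yield_throttle
{
    uint64_t freq = 0;             // 0 = disabled (as after lv2_obj::cleanup())
    uint64_t max_allowed_tsc = 0;  // hard cutoff near the end of the frame
    uint64_t last_tsc = 0;

    bool should_preempt(uint64_t tsc)
    {
        if (!freq)
            return false;

        if (tsc < last_tsc || tsc > max_allowed_tsc || tsc - last_tsc < freq)
            return false;

        last_tsc = tsc; // consume this slot, matching the s_last_yield_tsc update
        return true;
    }
};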

View File

@@ -266,6 +266,8 @@ public:
// Must be called under IDM lock
static bool has_ppus_in_running_state();
static void set_yield_frequency(u64 freq, u64 max_allowed_tsc);
static void cleanup();
template <typename T>

View File

@@ -2,6 +2,7 @@
#include "RSXThread.h"
#include "Emu/Cell/PPUCallback.h"
#include "Emu/Cell/SPUThread.h"
#include "Emu/Cell/timers.hpp"
#include "Common/BufferUtils.h"
@@ -42,6 +43,8 @@ rsx::frame_capture_data frame_capture;
extern CellGcmOffsetTable offsetTable;
extern thread_local std::string(*g_tls_log_prefix)();
LOG_CHANNEL(perf_log, "PERF");
template <>
bool serialize<rsx::rsx_state>(utils::serial& ar, rsx::rsx_state& o)
{
@@ -241,6 +244,17 @@ namespace rsx
fmt::throw_exception("rsx::get_address(offset=0x%x, location=0x%x): %s%s", offset, location, msg, src_loc{line, col, file, func});
}
extern void set_rsx_yield_flag() noexcept
{
if (auto rsx = get_current_renderer())
{
if (g_cfg.core.allow_rsx_cpu_preempt)
{
rsx->state += cpu_flag::yield;
}
}
}
std::pair<u32, u32> interleaved_range_info::calculate_required_range(u32 first, u32 count) const
{
if (single_vertex)
@@ -3373,7 +3387,7 @@ namespace rsx
switch (frame_limit)
{
case frame_limit_type::none: limit = 0.; break;
case frame_limit_type::none: limit = g_cfg.core.max_cpu_preempt_count_per_frame ? static_cast<double>(g_cfg.video.vblank_rate) : 0.; break;
case frame_limit_type::_50: limit = 50.; break;
case frame_limit_type::_60: limit = 60.; break;
case frame_limit_type::_30: limit = 30.; break;
@@ -3478,5 +3492,183 @@ namespace rsx
intr_thread->cmd_notify.notify_one();
}
}
evaluate_cpu_usage_reduction_limits();
}
void thread::evaluate_cpu_usage_reduction_limits()
{
const u64 max_preempt_count = g_cfg.core.max_cpu_preempt_count_per_frame;
if (!max_preempt_count)
{
frame_times.clear();
lv2_obj::set_yield_frequency(0, 0);
return;
}
const u64 current_time = get_system_time();
const u64 current_tsc = utils::get_tsc();
u64 preempt_count = 0;
if (frame_times.size() >= 60)
{
u64 diffs = 0;
for (usz i = 1; i < frame_times.size(); i++)
{
const u64 cur_diff = frame_times[i].timestamp - frame_times[i - 1].timestamp;
diffs += cur_diff;
}
const usz avg_frame_time = diffs / 59;
u32 lowered_delay = 0;
u32 highered_delay = 0;
bool can_reevaluate = true;
u64 prev_preempt_count = umax;
for (usz i = frame_times.size() - 30; i < frame_times.size(); i++)
{
if (prev_preempt_count == umax)
{
prev_preempt_count = frame_times[i].preempt_count;
continue;
}
if (prev_preempt_count != frame_times[i].preempt_count)
{
if (prev_preempt_count > frame_times[i].preempt_count)
{
lowered_delay++;
}
else if (prev_preempt_count < frame_times[i].preempt_count)
{
highered_delay++;
}
if (i > frame_times.size() - 30)
{
// Slow preemption count increase
can_reevaluate = false;
}
}
prev_preempt_count = frame_times[i].preempt_count;
}
preempt_count = frame_times.back().preempt_count;
u32 fails = 0;
u32 hard_fails = 0;
bool is_last_frame_a_fail = false;
auto abs_dst = [](u64 a, u64 b)
{
return a >= b ? a - b : b - a;
};
for (u32 i = 1; i <= frame_times.size(); i++)
{
const u64 cur_diff = (i == frame_times.size() ? current_time : frame_times[i].timestamp) - frame_times[i - 1].timestamp;
if (const u64 diff_of_diff = abs_dst(cur_diff, avg_frame_time);
diff_of_diff >= avg_frame_time / 4)
{
if (diff_of_diff >= avg_frame_time / 2)
{
highered_delay++;
hard_fails++;
if (i == frame_times.size())
{
is_last_frame_a_fail = true;
}
}
if (fails != umax)
{
fails++;
}
}
}
bool hard_measures_taken = false;
const usz fps_10 = 10'000'000 / avg_frame_time;
auto lower_preemption_count = [&]()
{
if (preempt_count >= 10)
{
preempt_count -= 10;
}
else
{
preempt_count = 0;
}
if (hard_fails > 2 && is_last_frame_a_fail)
{
hard_measures_taken = preempt_count > 1;
preempt_count = preempt_count * 7 / 8;
prevent_preempt_increase_tickets = 10;
}
else
{
prevent_preempt_increase_tickets = std::max<u32>(7, prevent_preempt_increase_tickets);
}
};
if (can_reevaluate)
{
const bool is_avg_fps_ok = (abs_dst(fps_10, 300) < 3 || abs_dst(fps_10, 600) < 4 || abs_dst(fps_10, g_cfg.video.vblank_rate * 10) < 4 || abs_dst(fps_10, g_cfg.video.vblank_rate * 10 / 2) < 3);
if (!hard_fails && fails < 6 && is_avg_fps_ok)
{
if (prevent_preempt_increase_tickets)
{
prevent_preempt_increase_tickets--;
}
else if (preempt_count < max_preempt_count)
{
preempt_count += 4;
}
}
else
{
lower_preemption_count();
}
}
// Sudden FPS drop detection
else if ((fails > 10 || hard_fails > 2 || !(abs_dst(fps_10, 300) < 20 || abs_dst(fps_10, 600) < 30 || abs_dst(fps_10, g_cfg.video.vblank_rate * 10) < 20 || abs_dst(fps_10, g_cfg.video.vblank_rate * 10 / 2) < 30)) && lowered_delay < highered_delay && is_last_frame_a_fail)
{
lower_preemption_count();
}
perf_log.trace("CPU preemption control: reeval=%d, preempt_count=%d, fails=%d, hard=%d, avg_frame_time=%d, highered=%d, lowered=%d", can_reevaluate, preempt_count, fails, hard_fails, avg_frame_time, highered_delay, lowered_delay);
if (hard_measures_taken)
{
preempt_fail_old_preempt_count = std::max<u32>(preempt_fail_old_preempt_count, frame_times.back().preempt_count);
}
else if (preempt_fail_old_preempt_count)
{
perf_log.error("Lowering current preemption count significantly due to a performance drop, if this issue persists frequantly consider lowering max preemptions count to 'new-count' or lower. (old-count=%d, new-count=%d)", preempt_fail_old_preempt_count, preempt_count);
preempt_fail_old_preempt_count = 0;
}
const u64 tsc_diff = (current_tsc - frame_times.back().tsc);
// Set an upper limit so that a backoff takes effect if there is a sudden performance drop
// Leave the final 6% of the frame free of yields to significantly reduce the risk of stutter
lv2_obj::set_yield_frequency(preempt_count ? tsc_diff / preempt_count : 0, current_tsc + (tsc_diff * 94 / 100));
frame_times.pop_front();
}
else
{
lv2_obj::set_yield_frequency(0, 0);
}
frame_times.push_back(frame_time_t{preempt_count, current_time, current_tsc});
}
}
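
To make the hand-off at the end of evaluate_cpu_usage_reduction_limits() concrete, a small worked example with made-up numbers: a previous frame measured as 40,000,000 TSC ticks and a preemption budget of 50. Only the arithmetic mirrors the set_yield_frequency() call above; every value here is illustrative.

#include <cstdint>
#include <cstdio>

int main()
{
    const std::uint64_t tsc_diff      = 40'000'000;  // TSC ticks spent on the previous frame (assumed)
    const std::uint64_t preempt_count = 50;          // budget picked by the control loop above (assumed)
    const std::uint64_t current_tsc   = 900'000'000; // arbitrary "now"

    // Mirrors lv2_obj::set_yield_frequency(tsc_diff / preempt_count, current_tsc + tsc_diff * 94 / 100):
    const std::uint64_t freq        = tsc_diff / preempt_count;           // one preemption every 800'000 ticks
    const std::uint64_t max_allowed = current_tsc + tsc_diff * 94 / 100;  // the final 6% of the frame never yields

    std::printf("preempt every %llu ticks, stop after tsc %llu\n",
                static_cast<unsigned long long>(freq),
                static_cast<unsigned long long>(max_allowed));
    return 0;
}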

View File

@@ -689,6 +689,17 @@ namespace rsx
std::queue<desync_fifo_cmd_info> recovered_fifo_cmds_history;
struct frame_time_t
{
u64 preempt_count;
u64 timestamp;
u64 tsc;
};
std::deque<frame_time_t> frame_times;
u32 prevent_preempt_increase_tickets = 0;
u32 preempt_fail_old_preempt_count = 0;
atomic_t<s32> async_tasks_pending{ 0 };
reports::conditional_render_eval cond_render_ctrl;
@@ -793,6 +804,7 @@ namespace rsx
shared_mutex m_mtx_task;
void handle_emu_flip(u32 buffer);
void evaluate_cpu_usage_reduction_limits();
void handle_invalidated_memory_range();
public:

View File

@@ -91,6 +91,8 @@ struct cfg_root : cfg::node
cfg::_int<10, 3000> clocks_scale{ this, "Clocks scale", 100 }; // Changing this from 100 (percentage) may affect game speed in unexpected ways
cfg::uint<0, 3000> spu_wakeup_delay{ this, "SPU Wake-Up Delay", 0, true };
cfg::uint<0, (1 << 6) - 1> spu_wakeup_delay_mask{ this, "SPU Wake-Up Delay Thread Mask", (1 << 6) - 1, true };
cfg::uint<0, 300> max_cpu_preempt_count_per_frame{ this, "Max CPU Preempt Count", 0, true };
cfg::_bool allow_rsx_cpu_preempt{ this, "Allow RSX CPU Preemptions", true, true };
#if defined (__linux__) || defined (__APPLE__)
cfg::_enum<sleep_timers_accuracy_level> sleep_timers_accuracy{ this, "Sleep Timers Accuracy", sleep_timers_accuracy_level::_as_host, true };
#else
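
For reference, a sketch of how the two new settings would appear in a configuration file, assuming the usual top-level "Core" node; the key names come from the declarations above, the values are examples only.

Core:
  Max CPU Preempt Count: 50
  Allow RSX CPU Preemptions: true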

View File

@@ -47,6 +47,7 @@ enum class emu_settings_type
FixupPPUVNAN,
AccuratePPUVNAN,
AccuratePPUFPCC,
MaxPreemptCount,
// Graphics
Renderer,
@@ -221,6 +222,7 @@ inline static const QMap<emu_settings_type, cfg_location> settings_location =
{ emu_settings_type::FixupPPUVNAN, { "Core", "PPU Fixup Vector NaN Values"}},
{ emu_settings_type::AccuratePPUVNAN, { "Core", "PPU Accurate Vector NaN Values"}},
{ emu_settings_type::AccuratePPUFPCC, { "Core", "PPU Set FPCC Bits"}},
{ emu_settings_type::MaxPreemptCount, { "Core", "Max CPU Preempt Count"}},
// Graphics Tab
{ emu_settings_type::Renderer, { "Video", "Renderer"}},

View File

@@ -1409,6 +1409,24 @@ settings_dialog::settings_dialog(std::shared_ptr<gui_settings> gui_settings, std
ui->clockScale->setValue(clocks_scale_def);
});
EnhanceSlider(emu_settings_type::MaxPreemptCount, ui->maxPreemptCount, ui->preemptText, tr(reinterpret_cast<const char*>(u8"%0"), "Max CPU preempt count"));
SubscribeTooltip(ui->gb_max_preempt_count, tooltips.settings.max_cpu_preempt);
#ifdef _WIN32
// Windows' thread execution slice is much larger than on other platforms
SnapSlider(ui->maxPreemptCount, 5);
ui->maxPreemptCount->setPageStep(20);
#else
SnapSlider(ui->maxPreemptCount, 10);
ui->maxPreemptCount->setPageStep(50);
#endif
const int preempt_def = stoi(m_emu_settings->GetSettingDefault(emu_settings_type::MaxPreemptCount));
connect(ui->preemptReset, &QAbstractButton::clicked, [preempt_def, this]()
{
ui->maxPreemptCount->setValue(preempt_def);
});
if (!game) // Prevent users from doing dumb things
{
ui->gb_vblank->setDisabled(true);

View File

@@ -193,6 +193,55 @@
</layout>
</widget>
</item>
<item>
<widget class="QGroupBox" name="gb_max_preempt_count">
<property name="sizePolicy">
<sizepolicy hsizetype="Preferred" vsizetype="Minimum">
<horstretch>0</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
<property name="title">
<string>Max Power Saving CPU-preemptions</string>
</property>
<layout class="QVBoxLayout" name="gb_max_preempt_layout">
<item>
<widget class="QSlider" name="maxPreemptCount">
<property name="sizePolicy">
<sizepolicy hsizetype="Preferred" vsizetype="Minimum">
<horstretch>0</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
<property name="orientation">
<enum>Qt::Horizontal</enum>
</property>
</widget>
</item>
<item>
<layout class="QHBoxLayout" name="maxPreemptLayout" stretch="0,0">
<item>
<widget class="QLabel" name="preemptText">
<property name="text">
<string>0</string>
</property>
<property name="alignment">
<set>Qt::AlignCenter</set>
</property>
</widget>
</item>
<item>
<widget class="QPushButton" name="preemptReset">
<property name="text">
<string>Reset</string>
</property>
</widget>
</item>
</layout>
</item>
</layout>
</widget>
</item>
<item>
<spacer name="coreTabMiddleLayoutSpacer">
<property name="orientation">

View File

@@ -85,6 +85,7 @@ public:
const QString fixup_ppuvnan = tr("Fixup NaN results in vector instructions in PPU backends.\nIf unsure, do not modify this setting.");
const QString accurate_ppuvnan = tr("Accurately set NaN results in vector instructions in PPU backends.\nIf unsure, do not modify this setting.");
const QString accurate_ppufpcc = tr("Accurately set FPCC Bits in PPU backends.\nIf unsure, do not modify this setting.");
const QString max_cpu_preempt = tr("Reduces CPU usage and power consumption; on mobile devices it improves battery life. (0 means disabled)\nHigher values have a more pronounced effect but may cause audio or performance issues. A value of 50 or less is recommended.\nThis option forces an FPS limit because it only activates while the framerate is stable.\nThe lighter the game is on the hardware, the more power is saved (until the preemption count cap is reached).");
// debug