Skip to content

Commit

Permalink
Create a precise_delay_usec function to avoid excessive CPU penalty w…
Browse files Browse the repository at this point in the history
…hen not intended
  • Loading branch information
mrsaturnsan committed Nov 25, 2024
1 parent 0c45ace commit 41b6414
Show file tree
Hide file tree
Showing 10 changed files with 118 additions and 29 deletions.
16 changes: 16 additions & 0 deletions core/core_bind.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -530,6 +530,13 @@ void OS::delay_usec(int p_usec) const {
::OS::get_singleton()->delay_usec(p_usec);
}

void OS::precise_delay_usec(int p_usec) const {
ERR_FAIL_COND_MSG(
p_usec < 0,
vformat("Can't sleep for %d microseconds. The delay provided must be greater than or equal to 0 microseconds.", p_usec));
::OS::get_singleton()->precise_delay_usec(p_usec);
}

/** This method uses a signed argument for better error reporting as it's used from the scripting API. */
void OS::delay_msec(int p_msec) const {
ERR_FAIL_COND_MSG(
Expand All @@ -538,6 +545,13 @@ void OS::delay_msec(int p_msec) const {
::OS::get_singleton()->delay_usec(int64_t(p_msec) * 1000);
}

void OS::precise_delay_msec(int p_msec) const {
ERR_FAIL_COND_MSG(
p_msec < 0,
vformat("Can't sleep for %d milliseconds. The delay provided must be greater than or equal to 0 milliseconds.", p_msec));
::OS::get_singleton()->precise_delay_usec(int64_t(p_msec) * 1000);
}

bool OS::is_userfs_persistent() const {
return ::OS::get_singleton()->is_userfs_persistent();
}
Expand Down Expand Up @@ -685,7 +699,9 @@ void OS::_bind_methods() {
ClassDB::bind_method(D_METHOD("get_restart_on_exit_arguments"), &OS::get_restart_on_exit_arguments);

ClassDB::bind_method(D_METHOD("delay_usec", "usec"), &OS::delay_usec);
ClassDB::bind_method(D_METHOD("precise_delay_usec", "usec"), &OS::precise_delay_usec);
ClassDB::bind_method(D_METHOD("delay_msec", "msec"), &OS::delay_msec);
ClassDB::bind_method(D_METHOD("precise_delay_msec", "msec"), &OS::precise_delay_msec);
ClassDB::bind_method(D_METHOD("get_locale"), &OS::get_locale);
ClassDB::bind_method(D_METHOD("get_locale_language"), &OS::get_locale_language);
ClassDB::bind_method(D_METHOD("get_model_name"), &OS::get_model_name);
Expand Down
2 changes: 2 additions & 0 deletions core/core_bind.h
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,9 @@ class OS : public Object {
Dictionary get_memory_info() const;

void delay_usec(int p_usec) const;
void precise_delay_usec(int p_usec) const;
void delay_msec(int p_msec) const;
void precise_delay_msec(int p_msec) const;
uint64_t get_ticks_msec() const;
uint64_t get_ticks_usec() const;

Expand Down
4 changes: 2 additions & 2 deletions core/os/os.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -596,7 +596,7 @@ void OS::add_frame_delay(bool p_can_draw) {
// the actual frame time into account.
// Due to the high fluctuation of the actual sleep duration, it's not recommended
// to use this as a FPS limiter.
delay_usec(frame_delay * 1000);
precise_delay_usec(frame_delay * 1000);
}

// Add a dynamic frame delay to decrease CPU/GPU usage. This takes the
Expand All @@ -616,7 +616,7 @@ void OS::add_frame_delay(bool p_can_draw) {
uint64_t current_ticks = get_ticks_usec();

if (current_ticks < target_ticks) {
delay_usec(target_ticks - current_ticks);
precise_delay_usec(target_ticks - current_ticks);
}

current_ticks = get_ticks_usec();
Expand Down
1 change: 1 addition & 0 deletions core/os/os.h
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,7 @@ class OS {
virtual double get_unix_time() const;

virtual void delay_usec(uint32_t p_usec) const = 0;
virtual void precise_delay_usec(uint32_t p_usec) const = 0;
virtual void add_frame_delay(bool p_can_draw);

virtual uint64_t get_ticks_usec() const = 0;
Expand Down
44 changes: 22 additions & 22 deletions core/os/spin_lock.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,28 @@
#include <intrin.h>
#endif

// Issues the compiler/architecture-specific instruction meant to be placed in
// the body of a busy-wait loop. The instruction is selected at compile time:
// the MSVC branch is checked first, then GCC/Clang. For any compiler or
// architecture not listed below, the function body is empty.
_ALWAYS_INLINE_ static void _cpu_pause() {
#if defined(_MSC_VER)
// ----- MSVC.
#if defined(_M_ARM) || defined(_M_ARM64) // ARM.
__yield();
#elif defined(_M_IX86) || defined(_M_X64) // x86.
_mm_pause();
#endif
#elif defined(__GNUC__) || defined(__clang__)
// ----- GCC/Clang.
#if defined(__i386__) || defined(__x86_64__) // x86.
__builtin_ia32_pause();
#elif defined(__arm__) || defined(__aarch64__) // ARM.
asm volatile("yield");
#elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) // PowerPC.
// NOTE(review): presumably the PowerPC "yield" priority-hint form of `or` — confirm against the ISA manual.
asm volatile("or 27,27,27");
#elif defined(__riscv) // RISC-V.
// NOTE(review): raw encoding, presumably the `pause` hint emitted via `.insn` so that assemblers lacking the mnemonic still accept it — confirm.
asm volatile(".insn i 0x0F, 0, x0, x0, 0x010");
#endif
#endif
}

#if defined(__APPLE__)

#include <os/lock.h>
Expand All @@ -60,28 +82,6 @@ class SpinLock {

#include <atomic>

// Issues the compiler/architecture-specific instruction meant to be placed in
// the body of a busy-wait loop. The instruction is selected at compile time:
// the MSVC branch is checked first, then GCC/Clang. For any compiler or
// architecture not listed below, the function body is empty.
_ALWAYS_INLINE_ static void _cpu_pause() {
#if defined(_MSC_VER)
// ----- MSVC.
#if defined(_M_ARM) || defined(_M_ARM64) // ARM.
__yield();
#elif defined(_M_IX86) || defined(_M_X64) // x86.
_mm_pause();
#endif
#elif defined(__GNUC__) || defined(__clang__)
// ----- GCC/Clang.
#if defined(__i386__) || defined(__x86_64__) // x86.
__builtin_ia32_pause();
#elif defined(__arm__) || defined(__aarch64__) // ARM.
asm volatile("yield");
#elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) // PowerPC.
// NOTE(review): presumably the PowerPC "yield" priority-hint form of `or` — confirm against the ISA manual.
asm volatile("or 27,27,27");
#elif defined(__riscv) // RISC-V.
// NOTE(review): raw encoding, presumably the `pause` hint emitted via `.insn` so that assemblers lacking the mnemonic still accept it — confirm.
asm volatile(".insn i 0x0F, 0, x0, x0, 0x010");
#endif
#endif
}

static_assert(std::atomic_bool::is_always_lock_free);

class alignas(Thread::CACHE_LINE_BYTES) SpinLock {
Expand Down
18 changes: 18 additions & 0 deletions doc/classes/OS.xml
Original file line number Diff line number Diff line change
Expand Up @@ -701,6 +701,24 @@
[b]Note:[/b] This method is implemented on Linux, macOS, and Windows.
</description>
</method>
<method name="precise_delay_msec" qualifiers="const">
<return type="void" />
<param index="0" name="msec" type="int" />
<description>
Delays execution of the current thread by [param msec] milliseconds with higher precision than [method delay_msec]. [param msec] must be greater than or equal to [code]0[/code]. Otherwise, [method precise_delay_msec] does nothing and prints an error message. The extra precision is obtained by busy-waiting for the final part of the delay, so this method uses more CPU time than [method delay_msec].
[b]Note:[/b] [method precise_delay_msec] is a [i]blocking[/i] way to delay code execution. To delay code execution in a non-blocking way, you may use [method SceneTree.create_timer]. Awaiting with a [SceneTreeTimer] delays the execution of code placed below the [code]await[/code] without affecting the rest of the project (or editor, for [EditorPlugin]s and [EditorScript]s).
[b]Note:[/b] When [method precise_delay_msec] is called on the main thread, it will freeze the project and will prevent it from redrawing and registering input until the delay has passed. When using [method precise_delay_msec] as part of an [EditorPlugin] or [EditorScript], it will freeze the editor but won't freeze the project if it is currently running (since the project is an independent child process).
</description>
</method>
<method name="precise_delay_usec" qualifiers="const">
<return type="void" />
<param index="0" name="usec" type="int" />
<description>
Delays execution of the current thread by [param usec] microseconds with higher precision than [method delay_usec]. [param usec] must be greater than or equal to [code]0[/code]. Otherwise, [method precise_delay_usec] does nothing and prints an error message. The extra precision is obtained by busy-waiting for the final part of the delay, so this method uses more CPU time than [method delay_usec].
[b]Note:[/b] [method precise_delay_usec] is a [i]blocking[/i] way to delay code execution. To delay code execution in a non-blocking way, you may use [method SceneTree.create_timer]. Awaiting with a [SceneTreeTimer] delays the execution of code placed below the [code]await[/code] without affecting the rest of the project (or editor, for [EditorPlugin]s and [EditorScript]s).
[b]Note:[/b] When [method precise_delay_usec] is called on the main thread, it will freeze the project and will prevent it from redrawing and registering input until the delay has passed. When using [method precise_delay_usec] as part of an [EditorPlugin] or [EditorScript], it will freeze the editor but won't freeze the project if it is currently running (since the project is an independent child process).
</description>
</method>
<method name="read_buffer_from_stdin">
<return type="PackedByteArray" />
<param index="0" name="buffer_size" type="int" />
Expand Down
50 changes: 46 additions & 4 deletions drivers/unix/os_unix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ static void _setup_clock() {
kern_return_t ret = mach_timebase_info(&info);
ERR_FAIL_COND_MSG(ret != 0, "OS CLOCK IS NOT WORKING!");
_clock_scale = ((double)info.numer / (double)info.denom) / 1000.0;
_clock_start = mach_absolute_time() * _clock_scale;
_clock_start = mach_absolute_time();
}
#else
#if defined(CLOCK_MONOTONIC_RAW) && !defined(WEB_ENABLED) // This is a better clock on Linux.
Expand Down Expand Up @@ -375,19 +375,61 @@ void OS_Unix::delay_usec(uint32_t p_usec) const {
}
}

// Blocks the calling thread for p_usec microseconds in two phases:
//  1. Repeated 1 ms coarse sleeps for as long as the remaining time is larger
//     than the current estimate of how long such a sleep really takes.
//  2. A spin-wait on the tick counter until the absolute deadline is reached.
// The estimate is (running mean + standard deviation) of the observed sleep
// durations and is tracked separately for every thread.
void OS_Unix::precise_delay_usec(uint32_t p_usec) const {
	// Per-thread running statistics of the observed 1 ms sleep duration, in microseconds.
	thread_local double sleep_estimate = 5000.0;
	thread_local double running_mean = 5000.0;
	thread_local double sum_sq_diff = 0.0;
	thread_local uint64_t sample_count = 1;

	const uint64_t start_ticks = get_ticks_usec();
	const uint64_t deadline = start_ticks + p_usec;

	// Coarse phase: keep sleeping while the remaining time exceeds the estimate.
	for (double remaining = p_usec; remaining > sleep_estimate;) {
		const uint64_t before = get_ticks_usec();
		delay_usec(1000);
		const uint64_t after = get_ticks_usec();

		// The full elapsed time counts against the remaining budget, but the
		// sample fed to the statistics is capped at 1 second.
		const double elapsed = after - before;
		remaining -= elapsed;
		const double sample = MIN(elapsed, 1000000.0);

		// Restart the accumulators once the sample count grows past one million.
		if (unlikely(sample_count > 1000000)) {
			running_mean = (running_mean + sample) / 2;
			sum_sq_diff = 0.0;
			sample_count = 1;
		}

		// Incremental mean / sum-of-squared-differences update.
		++sample_count;
		const double delta = sample - running_mean;
		running_mean += delta / sample_count;
		sum_sq_diff += delta * (sample - running_mean);
		sleep_estimate = running_mean + Math::sqrt(sum_sq_diff / (sample_count - 1));
	}

	// Fine phase: spin until the deadline.
	while (get_ticks_usec() < deadline) {
		_cpu_pause();
	}
}

// Returns the time elapsed since the clock start value `_clock_start`, in
// microseconds.
uint64_t OS_Unix::get_ticks_usec() const {
#if defined(__APPLE__)
	// `_clock_start` holds raw mach_absolute_time() ticks, so subtract first
	// and only then scale the difference by `_clock_scale`.
	uint64_t elapsed_ticks = mach_absolute_time() - _clock_start;
	return elapsed_ticks * _clock_scale;
#else
	// Unchecked return. Static analyzers might complain.
	// If _setup_clock() succeeded, we assume clock_gettime() works.
	struct timespec tv_now = { 0, 0 };
	clock_gettime(GODOT_CLOCK, &tv_now);
	uint64_t longtime = ((uint64_t)tv_now.tv_nsec / 1000L) + (uint64_t)tv_now.tv_sec * 1000000L;
	longtime -= _clock_start;

	return longtime;
#endif
}

Dictionary OS_Unix::get_memory_info() const {
Expand Down
1 change: 1 addition & 0 deletions drivers/unix/os_unix.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ class OS_Unix : public OS {
virtual double get_unix_time() const override;

virtual void delay_usec(uint32_t p_usec) const override;
virtual void precise_delay_usec(uint32_t p_usec) const override;
virtual uint64_t get_ticks_usec() const override;

virtual Dictionary get_memory_info() const override;
Expand Down
10 changes: 9 additions & 1 deletion platform/windows/os_windows.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -817,6 +817,14 @@ double OS_Windows::get_unix_time() const {
}

// Coarse delay built on Sleep(), which takes whole milliseconds. Requests
// shorter than 1000 microseconds (including 0) sleep for 1 ms; longer requests
// are truncated to whole milliseconds.
void OS_Windows::delay_usec(uint32_t p_usec) const {
	const uint32_t sleep_msec = (p_usec < 1000) ? 1 : (p_usec / 1000);
	Sleep(sleep_msec);
}

void OS_Windows::precise_delay_usec(uint32_t p_usec) const {
constexpr uint32_t tolerance = 1000 + 20;

uint64_t t0 = get_ticks_usec();
Expand All @@ -832,7 +840,7 @@ void OS_Windows::delay_usec(uint32_t p_usec) const {

// Spin-wait until we reach the precise target time.
while (get_ticks_usec() < target_time) {
YieldProcessor();
_cpu_pause();
}
}

Expand Down
1 change: 1 addition & 0 deletions platform/windows/os_windows.h
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,7 @@ class OS_Windows : public OS {
virtual Error set_cwd(const String &p_cwd) override;

virtual void delay_usec(uint32_t p_usec) const override;
virtual void precise_delay_usec(uint32_t p_usec) const override;
virtual uint64_t get_ticks_usec() const override;

virtual Dictionary get_memory_info() const override;
Expand Down

0 comments on commit 41b6414

Please sign in to comment.