From 67b96a5e18bb5b94c52c586ff3c9bf74c626555e Mon Sep 17 00:00:00 2001
From: Mes <mes900903@gmail.com>
Date: Sat, 11 Jan 2025 20:20:52 +0800
Subject: [PATCH] Scale frequency to suppress RCU CPU stall warning

Since the emulator currently operates using sequential emulation, the
execution time for the boot process is relatively long, which can result
in the generation of RCU CPU stall warnings.

To address this issue, there are several potential solutions:

1. Scale the frequency to slow down emulator time during the boot
   process, thereby eliminating RCU CPU stall warnings.
2. During the boot process, avoid using 'clock_gettime' to update ticks
   and instead manage the tick increment relationship manually.
3. Implement multi-threaded emulation to accelerate the emulator's
   execution speed.

Regarding the third option: although multi-threaded emulation can
significantly speed up the emulator, it cannot guarantee that this issue
will not reappear as the number of cores increases in the future.
Therefore, a better approach is to use method 1 or method 2, either of
which lets the emulator set an expected time for completing the boot
process.

The advantages and disadvantages of the scale method are as follows:

Advantages:
- Simple implementation
- Effectively sets the expected boot process completion time
- Results have strong interpretability
- Emulator time can be easily mapped back to real time

Disadvantages:
- Slower execution speed

The advantages and disadvantages of the increment ticks method are as
follows:

Advantages:
- Faster execution speed
- Effectively sets the expected boot process completion time

Disadvantages:
- More complex implementation
- Some results are difficult to interpret
- Emulator time is difficult to map back to real time

Based on practical tests, the second method provides limited
acceleration but introduces some significant drawbacks, such as
difficulty in interpreting results and the complexity of managing the
increment relationship. Therefore, this commit opts for the scale
frequency method to address this issue.

This commit distinguishes emulator time from real time. During the boot
process, the timer uses a scaled frequency to slow down the growth of
emulator time, eliminating RCU CPU stall warnings. After the boot
process is complete, emulator time advances at the same rate as real
time.

To configure the scale frequency parameter, three pieces of information
are required:

1. The expected completion time of the boot process
2. A reference point for estimating the boot process completion time
3. The relationship between the reference point and the number of SMPs

According to the Linux kernel documentation:
https://docs.kernel.org/RCU/stallwarn.html#config-rcu-cpu-stall-timeout

By default, RCU waits 21 seconds from the beginning of a grace period
before issuing an RCU CPU stall warning. Using roughly half of that
value (10 seconds) as the expected boot completion time provides a
sufficient buffer to absorb estimation errors and avoid RCU CPU stall
warnings.

Using 'gprof' for basic statistical analysis, it was found that
'semu_timer_clocksource' accounts for approximately 10% of the boot
process execution time. Since the logic within 'semu_timer_clocksource'
is relatively simple, its execution time can be assumed to be nearly
equal to 'clock_gettime'.

Furthermore, by adding a counter to 'semu_timer_clocksource', it was
observed that each additional hart (each increment of SMP) increases the
execution count of 'semu_timer_clocksource' by approximately '2 * 10^8'.

With this information, we can estimate the boot process completion time
as 'sec_per_call * SMPs * 2 * 10^8 * (100% / 10%)' seconds, and thereby
calculate the scale frequency parameter. For instance, if the estimated
time is 200 seconds and the target time is 10 seconds, the scaling
factor would be '10 / 200'.
---
 Makefile           |   4 +-
 riscv.c            |   8 ++
 riscv.h            |   2 +-
 timer.c            | 178 +++++++++++++++++++++++++++++++++++++++++++++
 utils.h => timer.h |   4 +-
 utils.c            |  66 -----------------
 6 files changed, 193 insertions(+), 69 deletions(-)
 create mode 100644 timer.c
 rename utils.h => timer.h (62%)
 delete mode 100644 utils.c

diff --git a/Makefile b/Makefile
index 24980d4c..7881b436 100644
--- a/Makefile
+++ b/Makefile
@@ -49,7 +49,7 @@ all: $(BIN) minimal.dtb
 OBJS := \
 	riscv.o \
 	ram.o \
-	utils.o \
+	timer.o \
 	plic.o \
 	uart.o \
 	main.o \
@@ -78,6 +78,8 @@ E :=
 S := $E $E
 
 SMP ?= 1
+CFLAGS += -D SEMU_SMP=$(SMP)
+CFLAGS += -D SEMU_BOOT_TARGET_TIME=10
 .PHONY: riscv-harts.dtsi
 riscv-harts.dtsi:
 	$(Q)python3 scripts/gen-hart-dts.py $@ $(SMP) $(CLOCK_FREQ)
diff --git a/riscv.c b/riscv.c
index c3fd394a..bd92f1f0 100644
--- a/riscv.c
+++ b/riscv.c
@@ -382,6 +382,14 @@ static void op_sret(hart_t *vm)
     vm->s_mode = vm->sstatus_spp;
     vm->sstatus_sie = vm->sstatus_spie;
 
+    /* After the booting process is complete, initrd will be loaded. At this
+     * point, the system will switch to U mode for the first time. Therefore,
+     * by checking whether the switch to U mode has already occurred, we can
+     * determine if the boot process has been completed.
+     */
+    if (!boot_complete && !vm->s_mode)
+        boot_complete = true;
+
     /* Reset stack */
     vm->sstatus_spp = false;
     vm->sstatus_spie = true;
diff --git a/riscv.h b/riscv.h
index 62c9cf89..70065bd6 100644
--- a/riscv.h
+++ b/riscv.h
@@ -3,7 +3,7 @@
 #include <stdbool.h>
 #include <stdint.h>
 
-#include "utils.h"
+#include "timer.h"
 
 /* ERR_EXCEPTION indicates that the instruction has raised one of the
  * exceptions defined in the specification. If this flag is set, the
diff --git a/timer.c b/timer.c
new file mode 100644
index 00000000..37b50fea
--- /dev/null
+++ b/timer.c
@@ -0,0 +1,178 @@
+#include <time.h>
+
+#include "timer.h"
+
+#if defined(__APPLE__)
+#define HAVE_MACH_TIMER
+#include <mach/mach_time.h>
+#define CLOCKID CLOCK_MONOTONIC /* for 'measure_bogomips_ns' calibration */
+#elif !defined(_WIN32) && !defined(_WIN64)
+#define HAVE_POSIX_TIMER
+
+/* Use a faster but less precise clock source because we need quick timestamps
+ * rather than fine-grained precision.
+ */
+#ifdef CLOCK_MONOTONIC_COARSE
+#define CLOCKID CLOCK_MONOTONIC_COARSE
+#else
+#define CLOCKID CLOCK_REALTIME_COARSE
+#endif
+#endif
+
+#ifndef SEMU_SMP
+#define SEMU_SMP 1
+#endif
+
+#ifndef SEMU_BOOT_TARGET_TIME
+#define SEMU_BOOT_TARGET_TIME 10
+#endif
+
+bool boot_complete = false;
+static double scale_factor;
+
+/* Calculate "x * n / d" without unnecessary overflow or loss of precision.
+ *
+ * Reference:
+ * https://elixir.bootlin.com/linux/v6.10.7/source/include/linux/math.h#L121
+ */
+static inline uint64_t mult_frac(uint64_t x, double n, uint64_t d)
+{
+    const uint64_t q = x / d;
+    const uint64_t r = x % d;
+
+    return q * n + r * n / d;
+}
+
+/* Use timespec and frequency to calculate how many ticks to increment. For
+ * example, if the frequency is set to 65,000,000, then there are 65,000,000
+ * ticks per second. Respectively, if the time is set to 1 second, then there
+ * are 65,000,000 ticks.
+ *
+ * Thus, by seconds * frequency + nanoseconds * frequency / 1,000,000,000, we
+ * can get the number of ticks.
+ */
+static inline uint64_t get_ticks(struct timespec *ts, double freq)
+{
+    return ts->tv_sec * freq + mult_frac(ts->tv_nsec, freq, 1000000000ULL);
+}
+
+/* Time 'target_loop' calls of 'clock_gettime' to derive 'scale_factor'. If the
+ * measurement is unusable (no loops, or no elapsed time), keep real-time speed.
+ */
+static void measure_bogomips_ns(uint64_t target_loop)
+{
+    struct timespec start, end;
+    clock_gettime(CLOCKID, &start);
+    end = start; /* keep 'end' defined even when 'target_loop' is zero */
+    for (uint64_t loops = 0; loops < target_loop; loops++)
+        clock_gettime(CLOCKID, &end);
+
+    double elapsed_ns = (end.tv_sec - start.tv_sec) * 1e9 +
+                        (end.tv_nsec - start.tv_nsec);
+    double ns_per_call = target_loop ? elapsed_ns / target_loop : 0;
+
+    /* Based on simple statistics, 'semu_timer_clocksource' accounts for
+     * approximately 10% of the boot process execution time. Since the logic
+     * inside 'semu_timer_clocksource' is relatively simple, it can be assumed
+     * that its execution time is roughly equivalent to that of a
+     * 'clock_gettime' call.
+     *
+     * Similarly, based on statistics, 'semu_timer_clocksource' is called
+     * approximately 2*1e8 times. Therefore, we can roughly estimate that the
+     * boot process will take '(ns_per_call/1e9) * SEMU_SMP * 2 * 1e8 *
+     * (100%/10%)' seconds. A non-positive estimate disables scaling.
+     */
+    double predict_sec = ns_per_call * SEMU_SMP * 2;
+    scale_factor = predict_sec > 0 ? SEMU_BOOT_TARGET_TIME / predict_sec : 1.0;
+}
+
+void semu_timer_init(semu_timer_t *timer, uint64_t freq)
+{
+    measure_bogomips_ns(freq); /* Measure the time taken by 'clock_gettime' */
+
+    timer->freq = freq;
+    semu_timer_rebase(timer, 0);
+}
+
+static uint64_t semu_timer_clocksource(semu_timer_t *timer)
+{
+    /* After boot process complete, the timer will switch to real time. Thus,
+     * there is an offset between the real time and the emulator time.
+     *
+     * After switching to real time, the correct way to update time is to
+     * calculate the increment of time. Then add it to the emulator time.
+     */
+    static int64_t offset = 0;
+    static bool first_switch = true;
+
+#if defined(HAVE_POSIX_TIMER)
+    struct timespec emulator_time;
+    clock_gettime(CLOCKID, &emulator_time);
+
+    if (!boot_complete) {
+        return get_ticks(&emulator_time, timer->freq * scale_factor);
+    } else {
+        if (first_switch) {
+            first_switch = false;
+            uint64_t real_ticks = get_ticks(&emulator_time, timer->freq);
+            uint64_t scaled_ticks =
+                get_ticks(&emulator_time, timer->freq * scale_factor);
+
+            offset = (int64_t) (real_ticks - scaled_ticks);
+        }
+
+        uint64_t real_freq_ticks = get_ticks(&emulator_time, timer->freq);
+        return real_freq_ticks - offset;
+    }
+#elif defined(HAVE_MACH_TIMER)
+    static mach_timebase_info_data_t emulator_time;
+    if (emulator_time.denom == 0)
+        (void) mach_timebase_info(&emulator_time);
+
+    uint64_t now = mach_absolute_time();
+    uint64_t ns = mult_frac(now, emulator_time.numer, emulator_time.denom);
+    if (!boot_complete) {
+        return mult_frac(ns, (uint64_t) (timer->freq * scale_factor),
+                         1000000000ULL);
+    } else {
+        if (first_switch) {
+            first_switch = false;
+            uint64_t real_ticks = mult_frac(ns, timer->freq, 1000000000ULL);
+            uint64_t scaled_ticks = mult_frac(
+                ns, (uint64_t) (timer->freq * scale_factor), 1000000000ULL);
+            offset = (int64_t) (real_ticks - scaled_ticks);
+        }
+
+        uint64_t real_freq_ticks = mult_frac(ns, timer->freq, 1000000000ULL);
+        return real_freq_ticks - offset;
+    }
+#else
+    time_t now_sec = time(0);
+
+    if (!boot_complete) {
+        return ((uint64_t) now_sec) * (uint64_t) (timer->freq * scale_factor);
+    } else {
+        if (first_switch) {
+            first_switch = false;
+            uint64_t real_val = ((uint64_t) now_sec) * (uint64_t) (timer->freq);
+            uint64_t scaled_val =
+                ((uint64_t) now_sec) * (uint64_t) (timer->freq * scale_factor);
+            offset = (int64_t) real_val - (int64_t) scaled_val;
+        }
+
+        uint64_t real_freq_val =
+            ((uint64_t) now_sec) * (uint64_t) (timer->freq);
+        return real_freq_val - offset;
+    }
+#endif
+}
+
+uint64_t semu_timer_get(semu_timer_t *timer)
+{
+    return semu_timer_clocksource(timer) - timer->begin;
+}
+
+void semu_timer_rebase(semu_timer_t *timer, uint64_t time)
+{
+    timer->begin = semu_timer_clocksource(timer) - time;
+}
diff --git a/utils.h b/timer.h
similarity index 62%
rename from utils.h
rename to timer.h
index 6e03ea0f..e3144983 100644
--- a/utils.h
+++ b/timer.h
@@ -8,6 +8,8 @@ typedef struct {
     uint64_t freq;
 } semu_timer_t;
 
+extern bool boot_complete; /* complete boot process and get in initrd */
+
 void semu_timer_init(semu_timer_t *timer, uint64_t freq);
 uint64_t semu_timer_get(semu_timer_t *timer);
-void semu_timer_rebase(semu_timer_t *timer, uint64_t time);
\ No newline at end of file
+void semu_timer_rebase(semu_timer_t *timer, uint64_t time);
diff --git a/utils.c b/utils.c
deleted file mode 100644
index 29f95752..00000000
--- a/utils.c
+++ /dev/null
@@ -1,66 +0,0 @@
-#include <time.h>
-
-#include "utils.h"
-
-#if defined(__APPLE__)
-#define HAVE_MACH_TIMER
-#include <mach/mach_time.h>
-#elif !defined(_WIN32) && !defined(_WIN64)
-#define HAVE_POSIX_TIMER
-
-/*
- * Use a faster but less precise clock source because we need quick
- * timestamps rather than fine-grained precision.
- */
-#ifdef CLOCK_MONOTONIC_COARSE
-#define CLOCKID CLOCK_MONOTONIC_COARSE
-#else
-#define CLOCKID CLOCK_REALTIME_COARSE
-#endif
-#endif
-
-/* Calculate "x * n / d" without unnecessary overflow or loss of precision.
- *
- * Reference:
- * https://elixir.bootlin.com/linux/v6.10.7/source/include/linux/math.h#L121
- */
-static inline uint64_t mult_frac(uint64_t x, uint64_t n, uint64_t d)
-{
-    const uint64_t q = x / d;
-    const uint64_t r = x % d;
-
-    return q * n + r * n / d;
-}
-
-void semu_timer_init(semu_timer_t *timer, uint64_t freq)
-{
-    timer->freq = freq;
-    semu_timer_rebase(timer, 0);
-}
-
-static uint64_t semu_timer_clocksource(uint64_t freq)
-{
-#if defined(HAVE_POSIX_TIMER)
-    struct timespec t;
-    clock_gettime(CLOCKID, &t);
-    return t.tv_sec * freq + mult_frac(t.tv_nsec, freq, 1e9);
-#elif defined(HAVE_MACH_TIMER)
-    static mach_timebase_info_data_t t;
-    if (t.denom == 0)
-        (void) mach_timebase_info(&t);
-    return mult_frac(mult_frac(mach_absolute_time(), t.numer, t.denom), freq,
-                     1e9);
-#else
-    return time(0) * freq;
-#endif
-}
-
-uint64_t semu_timer_get(semu_timer_t *timer)
-{
-    return semu_timer_clocksource(timer->freq) - timer->begin;
-}
-
-void semu_timer_rebase(semu_timer_t *timer, uint64_t time)
-{
-    timer->begin = semu_timer_clocksource(timer->freq) - time;
-}