// SPDX-License-Identifier: GPL-2.0
/*
 * kernel/sched/loadavg.c
 *
 * This file contains the magic bits required to compute the global loadavg
 * figure. It's a silly number but people think it's important. We go to
 * great lengths to make it work on big machines and tickless kernels.
 */
#include "sched.h"

/*
 * Global load-average calculations
 *
 * We take a distributed and async approach to calculating the global load-avg
 * in order to minimize overhead.
 *
 * The global load average is an exponentially decaying average of nr_running +
 * nr_uninterruptible.
 *
 * Once every LOAD_FREQ:
 *
 *	nr_active = 0;
 *	for_each_possible_cpu(cpu)
 *		nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
 *
 *	avenrun[n] = avenrun[n] * exp_n + nr_active * (1 - exp_n)
 *
 * Due to a number of reasons the above turns into the mess below:
 *
 *  - for_each_possible_cpu() is prohibitively expensive on machines with
 *    a serious number of CPUs, therefore we need to take a distributed
 *    approach to calculating nr_active.
 *
 *	\Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
 *		      = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
 *
 *    So assuming nr_active := 0 when we start out -- true by definition, we
 *    can simply take per-CPU deltas and fold those into a global accumulate
 *    to obtain the same result. See calc_load_fold_active().
 *
 *    Furthermore, in order to avoid synchronizing all per-CPU delta folding
 *    across the machine, we assume 10 ticks is sufficient time for every
 *    CPU to have completed this task.
 *
 *    This places an upper-bound on the IRQ-off latency of the machine. Then
 *    again, being late doesn't lose the delta, it just wrecks the sample.
 *
 *  - cpu_rq()->nr_uninterruptible isn't accurately tracked per-CPU because
 *    this would add another cross-CPU cacheline miss and atomic operation
 *    to the wakeup path. Instead we increment on whatever CPU the task ran
 *    when it went into uninterruptible state and decrement on whatever CPU
 *    did the wakeup. This means that only the sum of nr_uninterruptible over
 *    all CPUs yields the correct result.
 *
 * This covers the NO_HZ=n code; for extra headaches, see the comment below.
 */
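
/*
 * Illustrative sketch (not actual kernel code) of the folding identity
 * above: if, within one window, CPU0 goes from 3 to 5 active tasks and
 * CPU1 from 2 to 0, their ticks fold deltas of +2 and -2 respectively:
 *
 *	atomic_long_add(+2, &calc_load_tasks);	// CPU0's tick
 *	atomic_long_add(-2, &calc_load_tasks);	// CPU1's tick
 *
 * and the global sum moves by exactly the change in \Sum_i x_i(t),
 * without any CPU ever touching another CPU's runqueue.
 */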

/* Variables and functions for calc_load */
atomic_long_t calc_load_tasks;
unsigned long calc_load_update;
unsigned long avenrun[3];
EXPORT_SYMBOL(avenrun); /* should be removed */

/**
 * get_avenrun - get the load average array
 * @loads:	pointer to dest load array
 * @offset:	offset to add
 * @shift:	shift count to shift the result left
 *
 * These values are estimates at best, so no need for locking.
 */
void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
{
	loads[0] = (avenrun[0] + offset) << shift;
	loads[1] = (avenrun[1] + offset) << shift;
	loads[2] = (avenrun[2] + offset) << shift;
}
EXPORT_SYMBOL_GPL(get_avenrun);
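
/*
 * Example consumer (a sketch after fs/proc/loadavg.c; names come from
 * that file, not defined here):
 *
 *	unsigned long avnrun[3];
 *
 *	get_avenrun(avnrun, FIXED_1/200, 0);
 *	seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu ...\n",
 *		   LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
 *		   LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
 *		   LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]));
 *
 * The FIXED_1/200 offset rounds to the nearest 1/100th before
 * LOAD_INT()/LOAD_FRAC() (<linux/sched/loadavg.h>) split the fixed-point
 * value into its integer and two-digit fractional parts.
 */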
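/*
 * Fold this runqueue's change in active count (nr_running +
 * nr_uninterruptible, less @adjust for tasks the caller knows should not
 * be counted) into a delta against the value folded last time, and record
 * the new baseline in ->calc_load_active.
 */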
long calc_load_fold_active(struct rq *this_rq, long adjust)
{
	long nr_active, delta = 0;

	nr_active = this_rq->nr_running - adjust;
	nr_active += (long)this_rq->nr_uninterruptible;

	if (nr_active != this_rq->calc_load_active) {
		delta = nr_active - this_rq->calc_load_active;
		this_rq->calc_load_active = nr_active;
	}

	return delta;
}

/**
 * fixed_power_int - compute: x^n, in O(log n) time
 *
 * @x:         base of the power
 * @frac_bits: fractional bits of @x
 * @n:         power to raise @x to.
 *
 * By exploiting the relation between the definition of the natural power
 * function: x^n := x*x*...*x (x multiplied by itself n times), and the
 * binary encoding of numbers used by computers: n := \Sum n_i * 2^i
 * (where n_i \elem {0, 1} is the binary vector representing n), we find:
 * x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is of course
 * trivially computable in O(log_2 n), the length of our binary vector.
 */
static unsigned long
fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
	unsigned long result = 1UL << frac_bits;

	if (n) {
		for (;;) {
			if (n & 1) {
				result *= x;
				result += 1UL << (frac_bits - 1);
				result >>= frac_bits;
			}
			n >>= 1;
			if (!n)
				break;
			x *= x;
			x += 1UL << (frac_bits - 1);
			x >>= frac_bits;
		}
	}

	return result;
}
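
/*
 * Worked example (illustrative): with frac_bits = FSHIFT = 11, i.e.
 * 1.0 == 2048, squaring the 1-minute decay factor EXP_1 = 1884 gives:
 *
 *	fixed_power_int(1884, 11, 2) == (1884*1884 + 1024) >> 11 == 1733
 *
 * that is, (1884/2048)^2 == 0.9200^2 ~= 0.8463 ~= 1733/2048. For larger
 * n only ~log_2(n) such rounding multiplications are needed.
 */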

/*
 * a1 = a0 * e + a * (1 - e)
 *
 * a2 = a1 * e + a * (1 - e)
 *    = (a0 * e + a * (1 - e)) * e + a * (1 - e)
 *    = a0 * e^2 + a * (1 - e) * (1 + e)
 *
 * a3 = a2 * e + a * (1 - e)
 *    = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
 *    = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
 *
 *  ...
 *
 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^(n-1))	[1]
 *    = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
 *    = a0 * e^n + a * (1 - e^n)
 *
 * [1] application of the geometric series:
 *
 *              n         1 - x^(n+1)
 *   S_n := \Sum x^i = -------------
 *             i=0         1 - x
 */
unsigned long
calc_load_n(unsigned long load, unsigned long exp,
	    unsigned long active, unsigned int n)
{
	return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
}
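
/*
 * So catching up, e.g., n = 3 missed windows in one step (illustrative):
 *
 *	load = calc_load_n(load, EXP_1, active, 3);
 *
 * yields the same result as three successive
 *
 *	load = calc_load(load, EXP_1, active);
 *
 * applications with an unchanged active count, modulo fixed-point
 * rounding, since e^3 folds the three decays into one.
 */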

#ifdef CONFIG_NO_HZ_COMMON
/*
 * Handle NO_HZ for the global load-average.
 *
 * Since the above described distributed algorithm to compute the global
 * load-average relies on per-CPU sampling from the tick, it is affected by
 * NO_HZ.
 *
 * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon
 * entering NO_HZ state such that we can include this as an 'extra' CPU delta
 * when we read the global state.
 *
 * Obviously reality has to ruin such a delightfully simple scheme:
 *
 *  - When we go NO_HZ idle during the window, we can negate our sample
 *    contribution, causing under-accounting.
 *
 *    We avoid this by keeping two NO_HZ-delta counters and flipping them
 *    when the window starts, thus separating old and new NO_HZ load.
 *
 *    The only trick is the slight shift in index flip for read vs write.
 *
 *        0s            5s            10s           15s
 *          +10           +10           +10           +10
 *        |-|-----------|-|-----------|-|-----------|-|
 *    r:0 0 1           1 0           0 1           1 0
 *    w:0 1 1           0 0           1 1           0 0
 *
 *    This ensures we'll fold the old NO_HZ contribution in this window while
 *    accumulating the new one.
 *
 *  - When we wake up from NO_HZ during the window, we push up our
 *    contribution, since we effectively move our sample point to a known
 *    busy state.
 *
 *    This is solved by pushing the window forward, and thus skipping the
 *    sample, for this CPU (effectively using the NO_HZ-delta for this CPU which
 *    was in effect at the time the window opened). This also solves the issue
 *    of having to deal with a CPU having been in NO_HZ for multiple LOAD_FREQ
 *    intervals.
 *
 * When making the ILB scale, we should try to pull this in as well.
 */
static atomic_long_t calc_load_nohz[2];
static int calc_load_idx;
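
/*
 * Illustration of the index dance above (hypothetical timeline): say the
 * window opens at 10s with calc_load_idx == 0. A CPU going idle just
 * before 10s writes its delta into calc_load_nohz[0], the bucket this
 * window will read; one going idle just after 10s observes the expired
 * window, writes into calc_load_nohz[1], and that delta is only consumed
 * once the 15s window opens. New NO_HZ load thus never contaminates the
 * sample currently being read.
 */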

static inline int calc_load_write_idx(void)
{
	int idx = calc_load_idx;

	/*
	 * See calc_global_nohz(); if we observe the new index, we also
	 * need to observe the new update time.
	 */
	smp_rmb();

	/*
	 * If the folding window started, make sure we start writing in the
	 * next NO_HZ-delta.
	 */
	if (!time_before(jiffies, READ_ONCE(calc_load_update)))
		idx++;

	return idx & 1;
}

static inline int calc_load_read_idx(void)
{
	return calc_load_idx & 1;
}

static void calc_load_nohz_fold(struct rq *rq)
{
	long delta;

	delta = calc_load_fold_active(rq, 0);
	if (delta) {
		int idx = calc_load_write_idx();

		atomic_long_add(delta, &calc_load_nohz[idx]);
	}
}

void calc_load_nohz_start(void)
{
	/*
	 * We're going into NO_HZ mode; if there's any pending delta, fold it
	 * into the pending NO_HZ delta.
	 */
	calc_load_nohz_fold(this_rq());
}

/*
 * Keep track of the load for NOHZ_FULL; must be called between
 * calc_load_nohz_{start,stop}().
 */
void calc_load_nohz_remote(struct rq *rq)
{
	calc_load_nohz_fold(rq);
}

void calc_load_nohz_stop(void)
{
	struct rq *this_rq = this_rq();

	/*
	 * If we're still before the pending sample window, we're done.
	 */
	this_rq->calc_load_update = READ_ONCE(calc_load_update);
	if (time_before(jiffies, this_rq->calc_load_update))
		return;

	/*
	 * We woke inside or after the sample window, which means we're
	 * already accounted through the nohz accounting, so skip the entire
	 * deal and sync up for the next window.
	 */
	if (time_before(jiffies, this_rq->calc_load_update + 10))
		this_rq->calc_load_update += LOAD_FREQ;
}

static long calc_load_nohz_read(void)
{
	int idx = calc_load_read_idx();
	long delta = 0;

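	/*
	 * Peek before the xchg: the plain read avoids dirtying the shared
	 * cacheline with an atomic RMW when no delta is pending, which is
	 * the common case.
	 */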
	if (atomic_long_read(&calc_load_nohz[idx]))
		delta = atomic_long_xchg(&calc_load_nohz[idx], 0);

	return delta;
}

/*
 * NO_HZ can leave us missing all per-CPU ticks calling
 * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into
 * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold
 * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary.
 *
 * Once we've updated the global active value, we need to apply the exponential
 * weights adjusted to the number of cycles missed.
 */
static void calc_global_nohz(void)
{
	unsigned long sample_window;
	long delta, active, n;

	sample_window = READ_ONCE(calc_load_update);
	if (!time_before(jiffies, sample_window + 10)) {
		/*
		 * Catch up, folding however many windows we are still behind.
		 */
		delta = jiffies - sample_window - 10;
		n = 1 + (delta / LOAD_FREQ);

		active = atomic_long_read(&calc_load_tasks);
		active = active > 0 ? active * FIXED_1 : 0;

		avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
		avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
		avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);

		WRITE_ONCE(calc_load_update, sample_window + n * LOAD_FREQ);
	}

	/*
	 * Flip the NO_HZ index...
	 *
	 * Make sure we first write the new time and then flip the index, so
	 * that calc_load_write_idx() will see the new time when it reads the
	 * new index; this avoids a double flip messing things up.
	 */
	smp_wmb();
	calc_load_idx++;
}
#else /* !CONFIG_NO_HZ_COMMON */

static inline long calc_load_nohz_read(void) { return 0; }
static inline void calc_global_nohz(void) { }

#endif /* CONFIG_NO_HZ_COMMON */

/*
 * calc_global_load - update the avenrun load estimates 10 ticks after the
 * CPUs have updated calc_load_tasks.
 *
 * Called from the global timer code.
 */
void calc_global_load(void)
{
	unsigned long sample_window;
	long active, delta;

	sample_window = READ_ONCE(calc_load_update);
	if (time_before(jiffies, sample_window + 10))
		return;

	/*
	 * Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs.
	 */
	delta = calc_load_nohz_read();
	if (delta)
		atomic_long_add(delta, &calc_load_tasks);

	active = atomic_long_read(&calc_load_tasks);
	active = active > 0 ? active * FIXED_1 : 0;

	avenrun[0] = calc_load(avenrun[0], EXP_1, active);
	avenrun[1] = calc_load(avenrun[1], EXP_5, active);
	avenrun[2] = calc_load(avenrun[2], EXP_15, active);

	WRITE_ONCE(calc_load_update, sample_window + LOAD_FREQ);

	/*
	 * In case we went into NO_HZ for multiple LOAD_FREQ intervals,
	 * catch up in bulk.
	 */
	calc_global_nohz();
}

/*
 * Called from scheduler_tick() to periodically update this CPU's
 * active count.
 */
void calc_global_load_tick(struct rq *this_rq)
{
	long delta;

	if (time_before(jiffies, this_rq->calc_load_update))
		return;

	delta = calc_load_fold_active(this_rq, 0);
	if (delta)
		atomic_long_add(delta, &calc_load_tasks);

	this_rq->calc_load_update += LOAD_FREQ;
}