^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1) // SPDX-License-Identifier: GPL-2.0-only
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3) * mm/page-writeback.c
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5) * Copyright (C) 2002, Linus Torvalds.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6) * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8) * Contains functions related to writing back dirty pages at the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9) * address_space level.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 10) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 11) * 10Apr2002 Andrew Morton
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 12) * Initial version
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 13) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 14)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 15) #include <linux/kernel.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 16) #include <linux/export.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 17) #include <linux/spinlock.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 18) #include <linux/fs.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 19) #include <linux/mm.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 20) #include <linux/swap.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 21) #include <linux/slab.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 22) #include <linux/pagemap.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 23) #include <linux/writeback.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 24) #include <linux/init.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 25) #include <linux/backing-dev.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 26) #include <linux/task_io_accounting_ops.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 27) #include <linux/blkdev.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 28) #include <linux/mpage.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 29) #include <linux/rmap.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 30) #include <linux/percpu.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 31) #include <linux/smp.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 32) #include <linux/sysctl.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 33) #include <linux/cpu.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 34) #include <linux/syscalls.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 35) #include <linux/buffer_head.h> /* __set_page_dirty_buffers */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 36) #include <linux/pagevec.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 37) #include <linux/timer.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 38) #include <linux/sched/rt.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 39) #include <linux/sched/signal.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 40) #include <linux/mm_inline.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 41) #include <trace/events/writeback.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 42)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 43) #include "internal.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 44)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 45) #undef CREATE_TRACE_POINTS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 46) #include <trace/hooks/mm.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 47)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 48) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 49) * Sleep at most 200ms at a time in balance_dirty_pages().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 50) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 51) #define MAX_PAUSE max(HZ/5, 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 52)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 53) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 54) * Try to keep balance_dirty_pages() call intervals higher than this many pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 55) * by raising the pause time to max_pause when the interval falls below it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 56) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 57) #define DIRTY_POLL_THRESH (128 >> (PAGE_SHIFT - 10))
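/*
 * For example, with 4 KiB pages (PAGE_SHIFT == 12) this works out to
 * 128 >> 2 = 32 pages, i.e. the poll interval is kept at or above roughly
 * 128 KiB of newly dirtied data; with 64 KiB pages it shrinks to 2 pages.
 */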
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 58)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 59) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 60) * Estimate write bandwidth at 200ms intervals.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 61) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 62) #define BANDWIDTH_INTERVAL max(HZ/5, 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 63)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 64) #define RATELIMIT_CALC_SHIFT 10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 65)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 66) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 67) * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 68) * will look to see if it needs to force writeback or throttling.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 69) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 70) static long ratelimit_pages = 32;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 71)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 72) /* The following parameters are exported via /proc/sys/vm */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 73)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 74) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 75) * Start background writeback (via writeback threads) at this percentage
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 76) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 77) int dirty_background_ratio = 10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 78)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 79) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 80) * dirty_background_bytes starts at 0 (disabled) so that it is a function of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 81) * dirty_background_ratio * the amount of dirtyable memory
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 82) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 83) unsigned long dirty_background_bytes;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 84)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 85) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 86) * free highmem will not be subtracted from the total free memory
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 87) * for calculating free ratios if vm_highmem_is_dirtyable is true
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 88) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 89) int vm_highmem_is_dirtyable;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 90)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 91) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 92) * The generator of dirty data starts writeback at this percentage
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 93) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 94) int vm_dirty_ratio = 20;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 95)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 96) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 97) * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 98) * vm_dirty_ratio * the amount of dirtyable memory
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 99) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 100) unsigned long vm_dirty_bytes;
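/*
 * With the defaults above, a system with 8 GiB of dirtyable memory starts
 * background writeback at roughly 0.8 GiB of dirty page cache and begins
 * throttling dirtiers around 1.6 GiB.  The *_bytes and *_ratio knobs are
 * mutually exclusive: as the sysctl handlers below show, writing one form
 * clears its counterpart.
 */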
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 101)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 102) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 103) * The interval between `kupdate'-style writebacks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 104) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 105) unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 106)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 107) EXPORT_SYMBOL_GPL(dirty_writeback_interval);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 108)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 109) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 110) * The longest time for which data is allowed to remain dirty
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 111) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 112) unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 113)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 114) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 115) * Flag that makes the machine dump writes/reads and block dirtyings.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 116) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 117) int block_dump;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 118)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 119) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 120) * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 121) * a full sync is triggered after this time elapses without any disk activity.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 122) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 123) int laptop_mode;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 124)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 125) EXPORT_SYMBOL(laptop_mode);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 126)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 127) /* End of sysctl-exported parameters */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 128)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 129) struct wb_domain global_wb_domain;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 130)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 131) /* consolidated parameters for balance_dirty_pages() and its subroutines */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 132) struct dirty_throttle_control {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 133) #ifdef CONFIG_CGROUP_WRITEBACK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 134) struct wb_domain *dom;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 135) struct dirty_throttle_control *gdtc; /* only set in memcg dtc's */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 136) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 137) struct bdi_writeback *wb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 138) struct fprop_local_percpu *wb_completions;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 139)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 140) unsigned long avail; /* dirtyable */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 141) unsigned long dirty; /* file_dirty + write + nfs */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 142) unsigned long thresh; /* dirty threshold */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 143) unsigned long bg_thresh; /* dirty background threshold */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 144)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 145) unsigned long wb_dirty; /* per-wb counterparts */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 146) unsigned long wb_thresh;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 147) unsigned long wb_bg_thresh;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 148)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 149) unsigned long pos_ratio;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 150) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 151)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 152) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 153) * Length of period for aging writeout fractions of bdis. This is an
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 154) * arbitrarily chosen number. The longer the period, the more slowly the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 155) * fractions will reflect changes in the current writeout rate.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 156) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 157) #define VM_COMPLETIONS_PERIOD_LEN (3*HZ)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 158)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 159) #ifdef CONFIG_CGROUP_WRITEBACK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 160)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 161) #define GDTC_INIT(__wb) .wb = (__wb), \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 162) .dom = &global_wb_domain, \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 163) .wb_completions = &(__wb)->completions
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 164)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 165) #define GDTC_INIT_NO_WB .dom = &global_wb_domain
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 166)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 167) #define MDTC_INIT(__wb, __gdtc) .wb = (__wb), \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 168) .dom = mem_cgroup_wb_domain(__wb), \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 169) .wb_completions = &(__wb)->memcg_completions, \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 170) .gdtc = __gdtc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 171)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 172) static bool mdtc_valid(struct dirty_throttle_control *dtc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 173) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 174) return dtc->dom;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 175) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 176)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 177) static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 178) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 179) return dtc->dom;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 180) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 181)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 182) static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 183) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 184) return mdtc->gdtc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 185) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 186)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 187) static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 188) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 189) return &wb->memcg_completions;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 190) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 191)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 192) static void wb_min_max_ratio(struct bdi_writeback *wb,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 193) unsigned long *minp, unsigned long *maxp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 194) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 195) unsigned long this_bw = wb->avg_write_bandwidth;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 196) unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 197) unsigned long long min = wb->bdi->min_ratio;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 198) unsigned long long max = wb->bdi->max_ratio;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 199)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 200) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 201) * @wb may already be clean by the time control reaches here and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 202) * the total may not include its bw.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 203) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 204) if (this_bw < tot_bw) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 205) if (min) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 206) min *= this_bw;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 207) min = div64_ul(min, tot_bw);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 208) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 209) if (max < 100) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 210) max *= this_bw;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 211) max = div64_ul(max, tot_bw);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 212) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 213) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 214)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 215) *minp = min;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 216) *maxp = max;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 217) }
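/*
 * Example of the scaling above, with illustrative numbers: if this wb
 * currently owns 1/4 of the bdi's total write bandwidth and the bdi is
 * configured with min_ratio = 8 and max_ratio = 40, the wb's effective
 * share becomes min = 2 and max = 10.  A wb that owns all of the bandwidth
 * (this_bw >= tot_bw) keeps the bdi-wide values unchanged.
 */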
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 218)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 219) #else /* CONFIG_CGROUP_WRITEBACK */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 220)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 221) #define GDTC_INIT(__wb) .wb = (__wb), \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 222) .wb_completions = &(__wb)->completions
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 223) #define GDTC_INIT_NO_WB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 224) #define MDTC_INIT(__wb, __gdtc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 225)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 226) static bool mdtc_valid(struct dirty_throttle_control *dtc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 227) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 228) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 229) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 230)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 231) static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 232) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 233) return &global_wb_domain;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 234) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 235)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 236) static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 237) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 238) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 239) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 240)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 241) static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 242) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 243) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 244) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 245)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 246) static void wb_min_max_ratio(struct bdi_writeback *wb,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 247) unsigned long *minp, unsigned long *maxp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 248) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 249) *minp = wb->bdi->min_ratio;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 250) *maxp = wb->bdi->max_ratio;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 251) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 252)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 253) #endif /* CONFIG_CGROUP_WRITEBACK */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 254)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 255) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 256) * In a memory zone, there is a certain amount of pages we consider
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 257) * available for the page cache, which is essentially the number of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 258) * free and reclaimable pages, minus some zone reserves to protect
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 259) * lowmem and the ability to uphold the zone's watermarks without
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 260) * requiring writeback.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 261) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 262) * This number of dirtyable pages is the base value to which the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 263) * user-configurable dirty ratio is applied to get the effective number of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 264) * pages that are allowed to be actually dirtied, either per individual zone
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 265) * or globally by using the sum of dirtyable pages over all zones.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 266) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 267) * Because the user is allowed to specify the dirty limit globally as
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 268) * absolute number of bytes, calculating the per-zone dirty limit can
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 269) * require translating the configured limit into a percentage of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 270) * global dirtyable memory first.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 271) */
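/*
 * For instance, if vm_dirty_bytes is set to 1 GiB on a system with 4 GiB of
 * globally dirtyable memory, the configured limit corresponds to 25%, and
 * node_dirty_limit() below applies that same proportion to each node's own
 * dirtyable pages.
 */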
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 272)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 273) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 274) * node_dirtyable_memory - number of dirtyable pages in a node
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 275) * @pgdat: the node
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 276) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 277) * Return: the node's number of pages potentially available for dirty
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 278) * page cache. This is the base value for the per-node dirty limits.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 279) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 280) static unsigned long node_dirtyable_memory(struct pglist_data *pgdat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 281) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 282) unsigned long nr_pages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 283) int z;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 284)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 285) for (z = 0; z < MAX_NR_ZONES; z++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 286) struct zone *zone = pgdat->node_zones + z;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 287)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 288) if (!populated_zone(zone))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 289) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 290)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 291) nr_pages += zone_page_state(zone, NR_FREE_PAGES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 292) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 293)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 294) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 295) * Pages reserved for the kernel should not be considered
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 296) * dirtyable, to prevent a situation where reclaim has to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 297) * clean pages in order to balance the zones.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 298) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 299) nr_pages -= min(nr_pages, pgdat->totalreserve_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 300)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 301) nr_pages += node_page_state(pgdat, NR_INACTIVE_FILE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 302) nr_pages += node_page_state(pgdat, NR_ACTIVE_FILE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 303)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 304) return nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 305) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 306)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 307) static unsigned long highmem_dirtyable_memory(unsigned long total)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 308) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 309) #ifdef CONFIG_HIGHMEM
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 310) int node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 311) unsigned long x = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 312) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 313)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 314) for_each_node_state(node, N_HIGH_MEMORY) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 315) for (i = ZONE_NORMAL + 1; i < MAX_NR_ZONES; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 316) struct zone *z;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 317) unsigned long nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 318)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 319) if (!is_highmem_idx(i))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 320) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 321)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 322) z = &NODE_DATA(node)->node_zones[i];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 323) if (!populated_zone(z))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 324) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 325)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 326) nr_pages = zone_page_state(z, NR_FREE_PAGES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 327) /* watch for underflows */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 328) nr_pages -= min(nr_pages, high_wmark_pages(z));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 329) nr_pages += zone_page_state(z, NR_ZONE_INACTIVE_FILE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 330) nr_pages += zone_page_state(z, NR_ZONE_ACTIVE_FILE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 331) x += nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 332) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 333) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 334)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 335) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 336) * Unreclaimable memory (kernel memory or anonymous memory
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 337) * without swap) can bring down the dirtyable pages below
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 338) * the zone's dirty balance reserve and the above calculation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 339) * will underflow. However we still want to add in nodes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 340) * which are below threshold (negative values) to get a more
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 341) * accurate calculation but make sure that the total never
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 342) * underflows.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 343) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 344) if ((long)x < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 345) x = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 346)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 347) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 348) * Make sure that the number of highmem pages is never larger
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 349) * than the number of the total dirtyable memory. This can only
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 350) * occur in very strange VM situations but we want to make sure
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 351) * that this does not occur.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 352) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 353) return min(x, total);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 354) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 355) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 356) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 357) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 358)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 359) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 360) * global_dirtyable_memory - number of globally dirtyable pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 361) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 362) * Return: the global number of pages potentially available for dirty
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 363) * page cache. This is the base value for the global dirty limits.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 364) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 365) static unsigned long global_dirtyable_memory(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 366) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 367) unsigned long x;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 368)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 369) x = global_zone_page_state(NR_FREE_PAGES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 370) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 371) * Pages reserved for the kernel should not be considered
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 372) * dirtyable, to prevent a situation where reclaim has to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 373) * clean pages in order to balance the zones.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 374) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 375) x -= min(x, totalreserve_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 376)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 377) x += global_node_page_state(NR_INACTIVE_FILE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 378) x += global_node_page_state(NR_ACTIVE_FILE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 379)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 380) if (!vm_highmem_is_dirtyable)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 381) x -= highmem_dirtyable_memory(x);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 382)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 383) return x + 1; /* Ensure that we never return 0 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 384) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 385)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 386) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 387) * domain_dirty_limits - calculate thresh and bg_thresh for a wb_domain
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 388) * @dtc: dirty_throttle_control of interest
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 389) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 390) * Calculate @dtc->thresh and ->bg_thresh considering
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 391) * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}. The caller
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 392) * must ensure that @dtc->avail is set before calling this function. The
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 393) * dirty limits will be lifted by 1/4 for real-time tasks.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 394) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 395) static void domain_dirty_limits(struct dirty_throttle_control *dtc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 396) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 397) const unsigned long available_memory = dtc->avail;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 398) struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 399) unsigned long bytes = vm_dirty_bytes;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 400) unsigned long bg_bytes = dirty_background_bytes;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 401) /* convert ratios to per-PAGE_SIZE for higher precision */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 402) unsigned long ratio = (vm_dirty_ratio * PAGE_SIZE) / 100;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 403) unsigned long bg_ratio = (dirty_background_ratio * PAGE_SIZE) / 100;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 404) unsigned long thresh;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 405) unsigned long bg_thresh;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 406) struct task_struct *tsk;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 407)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 408) /* gdtc is !NULL iff @dtc is for memcg domain */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 409) if (gdtc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 410) unsigned long global_avail = gdtc->avail;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 411)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 412) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 413) * The byte settings can't be applied directly to memcg
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 414) * domains. Convert them to ratios by scaling against
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 415) * globally available memory. As the ratios are expressed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 416) * per PAGE_SIZE, they can be obtained by dividing the byte
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 417) * values by the number of pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 418) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 419) if (bytes)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 420) ratio = min(DIV_ROUND_UP(bytes, global_avail),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 421) PAGE_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 422) if (bg_bytes)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 423) bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 424) PAGE_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 425) bytes = bg_bytes = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 426) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 427)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 428) if (bytes)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 429) thresh = DIV_ROUND_UP(bytes, PAGE_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 430) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 431) thresh = (ratio * available_memory) / PAGE_SIZE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 432)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 433) if (bg_bytes)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 434) bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 435) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 436) bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 437)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 438) if (bg_thresh >= thresh)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 439) bg_thresh = thresh / 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 440) tsk = current;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 441) if (rt_task(tsk)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 442) bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 443) thresh += thresh / 4 + global_wb_domain.dirty_limit / 32;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 444) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 445) dtc->thresh = thresh;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 446) dtc->bg_thresh = bg_thresh;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 447)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 448) /* we should eventually report the domain in the TP */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 449) if (!gdtc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 450) trace_global_dirty_state(bg_thresh, thresh);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 451) }
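/*
 * A worked example of the above, with illustrative numbers: for
 * dtc->avail = 1,000,000 pages, vm_dirty_ratio = 20 and
 * dirty_background_ratio = 10, the per-PAGE_SIZE ratios yield
 * thresh = 200,000 and bg_thresh = 100,000 pages.  Had the background ratio
 * been configured at or above the dirty ratio, bg_thresh would be clamped
 * to thresh / 2, and a real-time caller additionally gets both values
 * lifted by a quarter of their value plus global_wb_domain.dirty_limit / 32.
 */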
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 452)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 453) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 454) * global_dirty_limits - background-writeback and dirty-throttling thresholds
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 455) * @pbackground: out parameter for bg_thresh
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 456) * @pdirty: out parameter for thresh
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 457) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 458) * Calculate bg_thresh and thresh for global_wb_domain. See
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 459) * domain_dirty_limits() for details.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 460) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 461) void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 462) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 463) struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 464)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 465) gdtc.avail = global_dirtyable_memory();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 466) domain_dirty_limits(&gdtc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 467)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 468) *pbackground = gdtc.bg_thresh;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 469) *pdirty = gdtc.thresh;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 470) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 471)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 472) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 473) * node_dirty_limit - maximum number of dirty pages allowed in a node
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 474) * @pgdat: the node
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 475) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 476) * Return: the maximum number of dirty pages allowed in a node, based
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 477) * on the node's dirtyable memory.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 478) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 479) static unsigned long node_dirty_limit(struct pglist_data *pgdat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 480) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 481) unsigned long node_memory = node_dirtyable_memory(pgdat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 482) struct task_struct *tsk = current;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 483) unsigned long dirty;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 484)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 485) if (vm_dirty_bytes)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 486) dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 487) node_memory / global_dirtyable_memory();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 488) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 489) dirty = vm_dirty_ratio * node_memory / 100;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 490)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 491) if (rt_task(tsk))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 492) dirty += dirty / 4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 493)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 494) return dirty;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 495) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 496)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 497) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 498) * node_dirty_ok - tells whether a node is within its dirty limits
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 499) * @pgdat: the node to check
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 500) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 501) * Return: %true when the dirty pages in @pgdat are within the node's
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 502) * dirty limit, %false if the limit is exceeded.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 503) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 504) bool node_dirty_ok(struct pglist_data *pgdat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 505) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 506) unsigned long limit = node_dirty_limit(pgdat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 507) unsigned long nr_pages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 508)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 509) nr_pages += node_page_state(pgdat, NR_FILE_DIRTY);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 510) nr_pages += node_page_state(pgdat, NR_WRITEBACK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 511)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 512) return nr_pages <= limit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 513) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 514)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 515) int dirty_background_ratio_handler(struct ctl_table *table, int write,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 516) void *buffer, size_t *lenp, loff_t *ppos)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 517) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 518) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 519)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 520) ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 521) if (ret == 0 && write)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 522) dirty_background_bytes = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 523) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 524) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 525)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 526) int dirty_background_bytes_handler(struct ctl_table *table, int write,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 527) void *buffer, size_t *lenp, loff_t *ppos)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 528) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 529) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 530)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 531) ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 532) if (ret == 0 && write)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 533) dirty_background_ratio = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 534) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 535) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 536)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 537) int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 538) size_t *lenp, loff_t *ppos)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 539) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 540) int old_ratio = vm_dirty_ratio;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 541) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 542)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 543) ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 544) if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 545) writeback_set_ratelimit();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 546) vm_dirty_bytes = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 547) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 548) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 549) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 550)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 551) int dirty_bytes_handler(struct ctl_table *table, int write,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 552) void *buffer, size_t *lenp, loff_t *ppos)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 553) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 554) unsigned long old_bytes = vm_dirty_bytes;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 555) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 556)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 557) ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 558) if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 559) writeback_set_ratelimit();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 560) vm_dirty_ratio = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 561) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 562) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 563) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 564)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 565) static unsigned long wp_next_time(unsigned long cur_time)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 566) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 567) cur_time += VM_COMPLETIONS_PERIOD_LEN;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 568) /* 0 is reserved to mean "period updates disabled", see writeout_period() */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 569) if (!cur_time)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 570) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 571) return cur_time;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 572) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 573)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 574) static void wb_domain_writeout_inc(struct wb_domain *dom,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 575) struct fprop_local_percpu *completions,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 576) unsigned int max_prop_frac)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 577) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 578) __fprop_inc_percpu_max(&dom->completions, completions,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 579) max_prop_frac);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 580) /* First event after period switching was turned off? */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 581) if (unlikely(!dom->period_time)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 582) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 583) * We can race with other wb_domain_writeout_inc() calls here but
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 584) * it does not cause any harm since the resulting time when the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 585) * timer will fire and what ends up in dom->period_time will be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 586) * roughly the same.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 587) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 588) dom->period_time = wp_next_time(jiffies);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 589) mod_timer(&dom->period_timer, dom->period_time);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 590) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 591) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 592)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 593) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 594) * Increment @wb's writeout completion count and the global writeout
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 595) * completion count. Called from test_clear_page_writeback().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 596) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 597) static inline void __wb_writeout_inc(struct bdi_writeback *wb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 598) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 599) struct wb_domain *cgdom;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 600)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 601) inc_wb_stat(wb, WB_WRITTEN);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 602) wb_domain_writeout_inc(&global_wb_domain, &wb->completions,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 603) wb->bdi->max_prop_frac);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 604)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 605) cgdom = mem_cgroup_wb_domain(wb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 606) if (cgdom)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 607) wb_domain_writeout_inc(cgdom, wb_memcg_completions(wb),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 608) wb->bdi->max_prop_frac);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 609) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 610)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 611) void wb_writeout_inc(struct bdi_writeback *wb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 612) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 613) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 614)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 615) local_irq_save(flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 616) __wb_writeout_inc(wb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 617) local_irq_restore(flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 618) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 619) EXPORT_SYMBOL_GPL(wb_writeout_inc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 620)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 621) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 622) * On an idle system we can be called long after the period we scheduled for,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 623) * because we use deferrable timers, so account for the missed periods.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 624) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 625) static void writeout_period(struct timer_list *t)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 626) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 627) struct wb_domain *dom = from_timer(dom, t, period_timer);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 628) int miss_periods = (jiffies - dom->period_time) /
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 629) VM_COMPLETIONS_PERIOD_LEN;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 630)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 631) if (fprop_new_period(&dom->completions, miss_periods + 1)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 632) dom->period_time = wp_next_time(dom->period_time +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 633) miss_periods * VM_COMPLETIONS_PERIOD_LEN);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 634) mod_timer(&dom->period_timer, dom->period_time);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 635) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 636) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 637) * Aging has zeroed all fractions. Stop wasting CPU on period
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 638) * updates.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 639) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 640) dom->period_time = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 641) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 642) }
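/*
 * Example, assuming HZ = 1000 (period = 3000 jiffies): if the deferrable
 * timer fires only 10000 jiffies after dom->period_time, then
 * miss_periods = 10000 / 3000 = 3, the completion fractions are aged by
 * 3 + 1 = 4 periods in one go, and the next expiry lands 12000 jiffies
 * after the old period_time, i.e. back on the original 3 s grid.
 */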
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 643)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 644) int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 645) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 646) memset(dom, 0, sizeof(*dom));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 647)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 648) spin_lock_init(&dom->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 649)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 650) timer_setup(&dom->period_timer, writeout_period, TIMER_DEFERRABLE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 651)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 652) dom->dirty_limit_tstamp = jiffies;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 653)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 654) return fprop_global_init(&dom->completions, gfp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 655) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 656)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 657) #ifdef CONFIG_CGROUP_WRITEBACK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 658) void wb_domain_exit(struct wb_domain *dom)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 659) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 660) del_timer_sync(&dom->period_timer);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 661) fprop_global_destroy(&dom->completions);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 662) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 663) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 664)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 665) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 666) * bdi_min_ratio keeps the sum of the minimum dirty shares of all
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 667) * registered backing devices, which, for obvious reasons, cannot
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 668) * exceed 100%.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 669) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 670) static unsigned int bdi_min_ratio;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 671)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 672) int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 673) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 674) int ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 675)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 676) spin_lock_bh(&bdi_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 677) if (min_ratio > bdi->max_ratio) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 678) ret = -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 679) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 680) min_ratio -= bdi->min_ratio;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 681) if (bdi_min_ratio + min_ratio < 100) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 682) bdi_min_ratio += min_ratio;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 683) bdi->min_ratio += min_ratio;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 684) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 685) ret = -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 686) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 687) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 688) spin_unlock_bh(&bdi_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 689)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 690) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 691) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 692)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 693) int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 694) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 695) int ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 696)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 697) if (max_ratio > 100)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 698) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 699)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 700) spin_lock_bh(&bdi_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 701) if (bdi->min_ratio > max_ratio) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 702) ret = -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 703) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 704) bdi->max_ratio = max_ratio;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 705) bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 706) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 707) spin_unlock_bh(&bdi_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 708)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 709) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 710) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 711) EXPORT_SYMBOL(bdi_set_max_ratio);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 712)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 713) static unsigned long dirty_freerun_ceiling(unsigned long thresh,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 714) unsigned long bg_thresh)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 715) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 716) return (thresh + bg_thresh) / 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 717) }
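/*
 * E.g. for thresh = 200,000 and bg_thresh = 100,000 pages the freerun
 * ceiling is 150,000 pages; below that level a dirtying task is allowed to
 * run free, without any throttling pause.
 */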
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 718)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 719) static unsigned long hard_dirty_limit(struct wb_domain *dom,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 720) unsigned long thresh)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 721) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 722) return max(thresh, dom->dirty_limit);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 723) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 724)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 725) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 726) * Memory which can be further allocated to a memcg domain is capped by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 727) * system-wide clean memory excluding the amount being used in the domain.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 728) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 729) static void mdtc_calc_avail(struct dirty_throttle_control *mdtc,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 730) unsigned long filepages, unsigned long headroom)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 731) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 732) struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 733) unsigned long clean = filepages - min(filepages, mdtc->dirty);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 734) unsigned long global_clean = gdtc->avail - min(gdtc->avail, gdtc->dirty);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 735) unsigned long other_clean = global_clean - min(global_clean, clean);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 736)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 737) mdtc->avail = filepages + min(headroom, other_clean);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 738) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 739)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 740) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 741) * __wb_calc_thresh - @wb's share of dirty throttling threshold
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 742) * @dtc: dirty_throttle_context of interest
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 743) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 744) * Note that balance_dirty_pages() will only seriously take it as a hard limit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 745) * when sleeping max_pause per page is not enough to keep the dirty pages under
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 746) * control. For example, when the device is completely stalled due to some error
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 747) * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 748) * In other, normal situations it acts more gently, throttling the tasks more
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 749) * (rather than completely blocking them) when the wb's dirty pages go high.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 750) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 751) * It allocates high/low dirty limits to fast/slow devices, in order to prevent
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 752) * - starving fast devices
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 753) * - piling up dirty pages (that will take long time to sync) on slow devices
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 754) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 755) * The wb's share of the dirty limit adapts to its throughput and is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 756) * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 757) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 758) * Return: @wb's dirty limit in pages. The term "dirty" in the context of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 759) * dirty balancing includes all PG_dirty and PG_writeback pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 760) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 761) static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 762) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 763) struct wb_domain *dom = dtc_dom(dtc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 764) unsigned long thresh = dtc->thresh;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 765) u64 wb_thresh;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 766) unsigned long numerator, denominator;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 767) unsigned long wb_min_ratio, wb_max_ratio;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 768)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 769) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 770) * Calculate this BDI's share of the thresh ratio.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 771) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 772) fprop_fraction_percpu(&dom->completions, dtc->wb_completions,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 773) &numerator, &denominator);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 774)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 775) wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 776) wb_thresh *= numerator;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 777) wb_thresh = div64_ul(wb_thresh, denominator);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 778)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 779) wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 780)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 781) wb_thresh += (thresh * wb_min_ratio) / 100;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 782) if (wb_thresh > (thresh * wb_max_ratio) / 100)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 783) wb_thresh = thresh * wb_max_ratio / 100;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 784)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 785) return wb_thresh;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 786) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 787)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 788) unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 789) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 790) struct dirty_throttle_control gdtc = { GDTC_INIT(wb),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 791) .thresh = thresh };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 792) return __wb_calc_thresh(&gdtc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 793) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 794)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 795) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 796) * setpoint - dirty 3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 797) * f(dirty) := 1.0 + (----------------)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 798) * limit - setpoint
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 799) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 800) * it's a 3rd order polynomial that subjects to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 801) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 802) * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 803) * (2) f(setpoint) = 1.0 => the balance point
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 804) * (3) f(limit) = 0 => the hard limit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 805) * (4) df/dx <= 0 => negative feedback control
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 806) * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 807) * => fast response on large errors; small oscillation near setpoint
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 808) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 809) static long long pos_ratio_polynom(unsigned long setpoint,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 810) unsigned long dirty,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 811) unsigned long limit)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 812) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 813) long long pos_ratio;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 814) long x;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 815)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 816) x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 817) (limit - setpoint) | 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 818) pos_ratio = x;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 819) pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 820) pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 821) pos_ratio += 1 << RATELIMIT_CALC_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 822)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 823) return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 824) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 825)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 826) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 827) * Dirty position control.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 828) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 829) * (o) global/bdi setpoints
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 830) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 831) * We want the dirty pages be balanced around the global/wb setpoints.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 832) * When the number of dirty pages is higher/lower than the setpoint, the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 833) * dirty position control ratio (and hence task dirty ratelimit) will be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 834) * decreased/increased to bring the dirty pages back to the setpoint.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 835) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 836) * pos_ratio = 1 << RATELIMIT_CALC_SHIFT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 837) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 838) * if (dirty < setpoint) scale up pos_ratio
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 839) * if (dirty > setpoint) scale down pos_ratio
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 840) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 841) * if (wb_dirty < wb_setpoint) scale up pos_ratio
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 842) * if (wb_dirty > wb_setpoint) scale down pos_ratio
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 843) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 844) * task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 845) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 846) * (o) global control line
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 847) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 848) * ^ pos_ratio
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 849) * |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 850) * | |<===== global dirty control scope ======>|
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 851) * 2.0 .............*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 852) * | .*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 853) * | . *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 854) * | . *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 855) * | . *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 856) * | . *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 857) * | . *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 858) * 1.0 ................................*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 859) * | . . *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 860) * | . . *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 861) * | . . *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 862) * | . . *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 863) * | . . *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 864) * 0 +------------.------------------.----------------------*------------->
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 865) * freerun^ setpoint^ limit^ dirty pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 866) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 867) * (o) wb control line
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 868) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 869) * ^ pos_ratio
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 870) * |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 871) * | *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 872) * | *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 873) * | *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 874) * | *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 875) * | * |<=========== span ============>|
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 876) * 1.0 .......................*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 877) * | . *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 878) * | . *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 879) * | . *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 880) * | . *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 881) * | . *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 882) * | . *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 883) * | . *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 884) * | . *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 885) * | . *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 886) * | . *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 887) * | . *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 888) * 1/4 ...............................................* * * * * * * * * * * *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 889) * | . .
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 890) * | . .
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 891) * | . .
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 892) * 0 +----------------------.-------------------------------.------------->
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 893) * wb_setpoint^ x_intercept^
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 894) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 895) * The wb control line won't drop below pos_ratio=1/4, so that wb_dirty can
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 896) * be smoothly throttled down to normal if it starts high in situations like
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 897) * - start writing to a slow SD card and a fast disk at the same time. The SD
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 898) * card's wb_dirty may rush to many times higher than wb_setpoint.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 899) * - the wb dirty thresh drops quickly due to change of JBOD workload
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 900) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 901) static void wb_position_ratio(struct dirty_throttle_control *dtc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 902) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 903) struct bdi_writeback *wb = dtc->wb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 904) unsigned long write_bw = wb->avg_write_bandwidth;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 905) unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 906) unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 907) unsigned long wb_thresh = dtc->wb_thresh;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 908) unsigned long x_intercept;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 909) unsigned long setpoint; /* dirty pages' target balance point */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 910) unsigned long wb_setpoint;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 911) unsigned long span;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 912) long long pos_ratio; /* for scaling up/down the rate limit */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 913) long x;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 914)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 915) dtc->pos_ratio = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 916)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 917) if (unlikely(dtc->dirty >= limit))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 918) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 919)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 920) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 921) * global setpoint
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 922) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 923) * See comment for pos_ratio_polynom().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 924) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 925) setpoint = (freerun + limit) / 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 926) pos_ratio = pos_ratio_polynom(setpoint, dtc->dirty, limit);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 927)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 928) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 929) * The strictlimit feature is a tool preventing mistrusted filesystems
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 930) * from growing a large number of dirty pages before throttling. For
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 931) * such filesystems balance_dirty_pages always checks wb counters
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 932) * against wb limits. Even if global "nr_dirty" is under "freerun".
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 933) * This is especially important for fuse which sets bdi->max_ratio to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 934) * 1% by default. Without strictlimit feature, fuse writeback may
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 935) * consume arbitrary amount of RAM because it is accounted in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 936) * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty".
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 937) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 938) * Here, in wb_position_ratio(), we calculate pos_ratio based on
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 939) * two values: wb_dirty and wb_thresh. Let's consider an example:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 940) * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 941) * limits are set by default to 10% and 20% (background and throttle).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 942) * Then wb_thresh is 1% of 20% of 16GB. This amounts to ~8K pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 943) * wb_calc_thresh(wb, bg_thresh) is about ~4K pages. wb_setpoint is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 944) * about ~6K pages (as the average of background and throttle wb
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 945) * limits). The 3rd order polynomial will provide positive feedback if
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 946) * wb_dirty is under wb_setpoint and vice versa.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 947) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 948) * Note, that we cannot use global counters in these calculations
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 949) * because we want to throttle process writing to a strictlimit wb
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 950) * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 951) * in the example above).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 952) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 953) if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 954) long long wb_pos_ratio;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 955)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 956) if (dtc->wb_dirty < 8) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 957) dtc->pos_ratio = min_t(long long, pos_ratio * 2,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 958) 2 << RATELIMIT_CALC_SHIFT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 959) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 960) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 961)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 962) if (dtc->wb_dirty >= wb_thresh)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 963) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 964)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 965) wb_setpoint = dirty_freerun_ceiling(wb_thresh,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 966) dtc->wb_bg_thresh);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 967)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 968) if (wb_setpoint == 0 || wb_setpoint == wb_thresh)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 969) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 970)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 971) wb_pos_ratio = pos_ratio_polynom(wb_setpoint, dtc->wb_dirty,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 972) wb_thresh);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 973)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 974) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 975) * Typically, for strictlimit case, wb_setpoint << setpoint
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 976) * and pos_ratio >> wb_pos_ratio. In the other words global
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 977) * state ("dirty") is not limiting factor and we have to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 978) * make decision based on wb counters. But there is an
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 979) * important case when global pos_ratio should get precedence:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 980) * global limits are exceeded (e.g. due to activities on other
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 981) * wb's) while given strictlimit wb is below limit.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 982) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 983) * "pos_ratio * wb_pos_ratio" would work for the case above,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 984) * but it would look too non-natural for the case of all
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 985) * activity in the system coming from a single strictlimit wb
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 986) * with bdi->max_ratio == 100%.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 987) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 988) * Note that min() below somewhat changes the dynamics of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 989) * control system. Normally, pos_ratio value can be well over 3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 990) * (when globally we are at freerun and wb is well below wb
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 991) * setpoint). Now the maximum pos_ratio in the same situation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 992) * is 2. We might want to tweak this if we observe the control
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 993) * system is too slow to adapt.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 994) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 995) dtc->pos_ratio = min(pos_ratio, wb_pos_ratio);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 996) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 997) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 998)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 999) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1000) * We have computed basic pos_ratio above based on global situation. If
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1001) * the wb is over/under its share of dirty pages, we want to scale
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1002) * pos_ratio further down/up. That is done by the following mechanism.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1003) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1004)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1005) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1006) * wb setpoint
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1007) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1008) * f(wb_dirty) := 1.0 + k * (wb_dirty - wb_setpoint)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1009) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1010) * x_intercept - wb_dirty
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1011) * := --------------------------
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1012) * x_intercept - wb_setpoint
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1013) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1014) * The main wb control line is a linear function that subjects to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1015) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1016) * (1) f(wb_setpoint) = 1.0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1017) * (2) k = - 1 / (8 * write_bw) (in single wb case)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1018) * or equally: x_intercept = wb_setpoint + 8 * write_bw
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1019) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1020) * For single wb case, the dirty pages are observed to fluctuate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1021) * regularly within range
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1022) * [wb_setpoint - write_bw/2, wb_setpoint + write_bw/2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1023) * for various filesystems, where (2) can yield in a reasonable 12.5%
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1024) * fluctuation range for pos_ratio.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1025) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1026) * For JBOD case, wb_thresh (not wb_dirty!) could fluctuate up to its
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1027) * own size, so move the slope over accordingly and choose a slope that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1028) * yields 100% pos_ratio fluctuation on suddenly doubled wb_thresh.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1029) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1030) if (unlikely(wb_thresh > dtc->thresh))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1031) wb_thresh = dtc->thresh;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1032) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1033) * It's very possible that wb_thresh is close to 0 not because the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1034) * device is slow, but that it has remained inactive for long time.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1035) * Honour such devices a reasonable good (hopefully IO efficient)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1036) * threshold, so that the occasional writes won't be blocked and active
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1037) * writes can rampup the threshold quickly.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1038) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1039) wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1040) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1041) * scale global setpoint to wb's:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1042) * wb_setpoint = setpoint * wb_thresh / thresh
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1043) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1044) x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1045) wb_setpoint = setpoint * (u64)x >> 16;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1046) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1047) * Use span=(8*write_bw) in single wb case as indicated by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1048) * (thresh - wb_thresh ~= 0) and transit to wb_thresh in JBOD case.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1049) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1050) * wb_thresh thresh - wb_thresh
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1051) * span = --------- * (8 * write_bw) + ------------------ * wb_thresh
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1052) * thresh thresh
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1053) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1054) span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1055) x_intercept = wb_setpoint + span;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1056)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1057) if (dtc->wb_dirty < x_intercept - span / 4) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1058) pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1059) (x_intercept - wb_setpoint) | 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1060) } else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1061) pos_ratio /= 4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1062)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1063) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1064) * wb reserve area, safeguard against dirty pool underrun and disk idle
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1065) * It may push the desired control point of global dirty pages higher
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1066) * than setpoint.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1067) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1068) x_intercept = wb_thresh / 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1069) if (dtc->wb_dirty < x_intercept) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1070) if (dtc->wb_dirty > x_intercept / 8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1071) pos_ratio = div_u64(pos_ratio * x_intercept,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1072) dtc->wb_dirty);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1073) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1074) pos_ratio *= 8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1075) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1076)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1077) dtc->pos_ratio = pos_ratio;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1078) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1079)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1080) static void wb_update_write_bandwidth(struct bdi_writeback *wb,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1081) unsigned long elapsed,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1082) unsigned long written)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1083) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1084) const unsigned long period = roundup_pow_of_two(3 * HZ);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1085) unsigned long avg = wb->avg_write_bandwidth;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1086) unsigned long old = wb->write_bandwidth;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1087) u64 bw;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1088)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1089) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1090) * bw = written * HZ / elapsed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1091) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1092) * bw * elapsed + write_bandwidth * (period - elapsed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1093) * write_bandwidth = ---------------------------------------------------
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1094) * period
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1095) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1096) * @written may have decreased due to account_page_redirty().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1097) * Avoid underflowing @bw calculation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1098) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1099) bw = written - min(written, wb->written_stamp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1100) bw *= HZ;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1101) if (unlikely(elapsed > period)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1102) bw = div64_ul(bw, elapsed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1103) avg = bw;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1104) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1105) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1106) bw += (u64)wb->write_bandwidth * (period - elapsed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1107) bw >>= ilog2(period);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1108)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1109) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1110) * one more level of smoothing, for filtering out sudden spikes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1111) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1112) if (avg > old && old >= (unsigned long)bw)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1113) avg -= (avg - old) >> 3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1114)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1115) if (avg < old && old <= (unsigned long)bw)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1116) avg += (old - avg) >> 3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1117)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1118) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1119) /* keep avg > 0 to guarantee that tot > 0 if there are dirty wbs */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1120) avg = max(avg, 1LU);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1121) if (wb_has_dirty_io(wb)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1122) long delta = avg - wb->avg_write_bandwidth;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1123) WARN_ON_ONCE(atomic_long_add_return(delta,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1124) &wb->bdi->tot_write_bandwidth) <= 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1125) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1126) wb->write_bandwidth = bw;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1127) wb->avg_write_bandwidth = avg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1128) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1129)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1130) static void update_dirty_limit(struct dirty_throttle_control *dtc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1131) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1132) struct wb_domain *dom = dtc_dom(dtc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1133) unsigned long thresh = dtc->thresh;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1134) unsigned long limit = dom->dirty_limit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1135)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1136) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1137) * Follow up in one step.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1138) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1139) if (limit < thresh) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1140) limit = thresh;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1141) goto update;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1142) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1143)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1144) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1145) * Follow down slowly. Use the higher one as the target, because thresh
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1146) * may drop below dirty. This is exactly the reason to introduce
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1147) * dom->dirty_limit which is guaranteed to lie above the dirty pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1148) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1149) thresh = max(thresh, dtc->dirty);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1150) if (limit > thresh) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1151) limit -= (limit - thresh) >> 5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1152) goto update;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1153) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1154) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1155) update:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1156) dom->dirty_limit = limit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1157) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1158)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1159) static void domain_update_bandwidth(struct dirty_throttle_control *dtc,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1160) unsigned long now)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1161) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1162) struct wb_domain *dom = dtc_dom(dtc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1163)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1164) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1165) * check locklessly first to optimize away locking for the most time
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1166) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1167) if (time_before(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1168) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1169)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1170) spin_lock(&dom->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1171) if (time_after_eq(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1172) update_dirty_limit(dtc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1173) dom->dirty_limit_tstamp = now;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1174) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1175) spin_unlock(&dom->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1176) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1177)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1178) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1179) * Maintain wb->dirty_ratelimit, the base dirty throttle rate.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1180) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1181) * Normal wb tasks will be curbed at or below it in long term.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1182) * Obviously it should be around (write_bw / N) when there are N dd tasks.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1183) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1184) static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1185) unsigned long dirtied,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1186) unsigned long elapsed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1187) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1188) struct bdi_writeback *wb = dtc->wb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1189) unsigned long dirty = dtc->dirty;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1190) unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1191) unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1192) unsigned long setpoint = (freerun + limit) / 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1193) unsigned long write_bw = wb->avg_write_bandwidth;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1194) unsigned long dirty_ratelimit = wb->dirty_ratelimit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1195) unsigned long dirty_rate;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1196) unsigned long task_ratelimit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1197) unsigned long balanced_dirty_ratelimit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1198) unsigned long step;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1199) unsigned long x;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1200) unsigned long shift;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1201)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1202) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1203) * The dirty rate will match the writeout rate in long term, except
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1204) * when dirty pages are truncated by userspace or re-dirtied by FS.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1205) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1206) dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1207)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1208) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1209) * task_ratelimit reflects each dd's dirty rate for the past 200ms.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1210) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1211) task_ratelimit = (u64)dirty_ratelimit *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1212) dtc->pos_ratio >> RATELIMIT_CALC_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1213) task_ratelimit++; /* it helps rampup dirty_ratelimit from tiny values */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1214)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1215) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1216) * A linear estimation of the "balanced" throttle rate. The theory is,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1217) * if there are N dd tasks, each throttled at task_ratelimit, the wb's
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1218) * dirty_rate will be measured to be (N * task_ratelimit). So the below
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1219) * formula will yield the balanced rate limit (write_bw / N).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1220) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1221) * Note that the expanded form is not a pure rate feedback:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1222) * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) (1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1223) * but also takes pos_ratio into account:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1224) * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) * pos_ratio (2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1225) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1226) * (1) is not realistic because pos_ratio also takes part in balancing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1227) * the dirty rate. Consider the state
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1228) * pos_ratio = 0.5 (3)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1229) * rate = 2 * (write_bw / N) (4)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1230) * If (1) is used, it will stuck in that state! Because each dd will
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1231) * be throttled at
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1232) * task_ratelimit = pos_ratio * rate = (write_bw / N) (5)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1233) * yielding
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1234) * dirty_rate = N * task_ratelimit = write_bw (6)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1235) * put (6) into (1) we get
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1236) * rate_(i+1) = rate_(i) (7)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1237) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1238) * So we end up using (2) to always keep
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1239) * rate_(i+1) ~= (write_bw / N) (8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1240) * regardless of the value of pos_ratio. As long as (8) is satisfied,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1241) * pos_ratio is able to drive itself to 1.0, which is not only where
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1242) * the dirty count meet the setpoint, but also where the slope of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1243) * pos_ratio is most flat and hence task_ratelimit is least fluctuated.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1244) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1245) balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1246) dirty_rate | 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1247) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1248) * balanced_dirty_ratelimit ~= (write_bw / N) <= write_bw
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1249) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1250) if (unlikely(balanced_dirty_ratelimit > write_bw))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1251) balanced_dirty_ratelimit = write_bw;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1252)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1253) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1254) * We could safely do this and return immediately:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1255) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1256) * wb->dirty_ratelimit = balanced_dirty_ratelimit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1257) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1258) * However to get a more stable dirty_ratelimit, the below elaborated
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1259) * code makes use of task_ratelimit to filter out singular points and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1260) * limit the step size.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1261) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1262) * The below code essentially only uses the relative value of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1263) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1264) * task_ratelimit - dirty_ratelimit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1265) * = (pos_ratio - 1) * dirty_ratelimit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1266) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1267) * which reflects the direction and size of dirty position error.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1268) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1269)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1270) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1271) * dirty_ratelimit will follow balanced_dirty_ratelimit iff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1272) * task_ratelimit is on the same side of dirty_ratelimit, too.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1273) * For example, when
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1274) * - dirty_ratelimit > balanced_dirty_ratelimit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1275) * - dirty_ratelimit > task_ratelimit (dirty pages are above setpoint)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1276) * lowering dirty_ratelimit will help meet both the position and rate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1277) * control targets. Otherwise, don't update dirty_ratelimit if it will
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1278) * only help meet the rate target. After all, what the users ultimately
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1279) * feel and care are stable dirty rate and small position error.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1280) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1281) * |task_ratelimit - dirty_ratelimit| is used to limit the step size
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1282) * and filter out the singular points of balanced_dirty_ratelimit. Which
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1283) * keeps jumping around randomly and can even leap far away at times
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1284) * due to the small 200ms estimation period of dirty_rate (we want to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1285) * keep that period small to reduce time lags).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1286) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1287) step = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1288)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1289) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1290) * For strictlimit case, calculations above were based on wb counters
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1291) * and limits (starting from pos_ratio = wb_position_ratio() and up to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1292) * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1293) * Hence, to calculate "step" properly, we have to use wb_dirty as
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1294) * "dirty" and wb_setpoint as "setpoint".
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1295) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1296) * We rampup dirty_ratelimit forcibly if wb_dirty is low because
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1297) * it's possible that wb_thresh is close to zero due to inactivity
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1298) * of backing device.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1299) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1300) if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1301) dirty = dtc->wb_dirty;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1302) if (dtc->wb_dirty < 8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1303) setpoint = dtc->wb_dirty + 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1304) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1305) setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1306) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1307)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1308) if (dirty < setpoint) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1309) x = min3(wb->balanced_dirty_ratelimit,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1310) balanced_dirty_ratelimit, task_ratelimit);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1311) if (dirty_ratelimit < x)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1312) step = x - dirty_ratelimit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1313) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1314) x = max3(wb->balanced_dirty_ratelimit,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1315) balanced_dirty_ratelimit, task_ratelimit);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1316) if (dirty_ratelimit > x)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1317) step = dirty_ratelimit - x;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1318) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1319)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1320) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1321) * Don't pursue 100% rate matching. It's impossible since the balanced
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1322) * rate itself is constantly fluctuating. So decrease the track speed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1323) * when it gets close to the target. Helps eliminate pointless tremors.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1324) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1325) shift = dirty_ratelimit / (2 * step + 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1326) if (shift < BITS_PER_LONG)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1327) step = DIV_ROUND_UP(step >> shift, 8);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1328) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1329) step = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1330)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1331) if (dirty_ratelimit < balanced_dirty_ratelimit)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1332) dirty_ratelimit += step;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1333) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1334) dirty_ratelimit -= step;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1335)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1336) wb->dirty_ratelimit = max(dirty_ratelimit, 1UL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1337) wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1338)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1339) trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1340) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1341)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1342) static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1343) struct dirty_throttle_control *mdtc,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1344) unsigned long start_time,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1345) bool update_ratelimit)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1346) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1347) struct bdi_writeback *wb = gdtc->wb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1348) unsigned long now = jiffies;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1349) unsigned long elapsed = now - wb->bw_time_stamp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1350) unsigned long dirtied;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1351) unsigned long written;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1352)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1353) lockdep_assert_held(&wb->list_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1354)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1355) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1356) * rate-limit, only update once every 200ms.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1357) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1358) if (elapsed < BANDWIDTH_INTERVAL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1359) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1360)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1361) dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1362) written = percpu_counter_read(&wb->stat[WB_WRITTEN]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1363)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1364) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1365) * Skip quiet periods when disk bandwidth is under-utilized.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1366) * (at least 1s idle time between two flusher runs)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1367) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1368) if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1369) goto snapshot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1370)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1371) if (update_ratelimit) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1372) domain_update_bandwidth(gdtc, now);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1373) wb_update_dirty_ratelimit(gdtc, dirtied, elapsed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1374)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1375) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1376) * @mdtc is always NULL if !CGROUP_WRITEBACK but the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1377) * compiler has no way to figure that out. Help it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1378) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1379) if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1380) domain_update_bandwidth(mdtc, now);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1381) wb_update_dirty_ratelimit(mdtc, dirtied, elapsed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1382) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1383) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1384) wb_update_write_bandwidth(wb, elapsed, written);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1385)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1386) snapshot:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1387) wb->dirtied_stamp = dirtied;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1388) wb->written_stamp = written;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1389) wb->bw_time_stamp = now;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1390) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1391)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1392) void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1393) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1394) struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1395)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1396) __wb_update_bandwidth(&gdtc, NULL, start_time, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1397) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1398)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1399) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1400) * After a task dirtied this many pages, balance_dirty_pages_ratelimited()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1401) * will look to see if it needs to start dirty throttling.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1402) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1403) * If dirty_poll_interval is too low, big NUMA machines will call the expensive
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1404) * global_zone_page_state() too often. So scale it near-sqrt to the safety margin
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1405) * (the number of pages we may dirty without exceeding the dirty limits).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1406) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1407) static unsigned long dirty_poll_interval(unsigned long dirty,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1408) unsigned long thresh)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1409) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1410) if (thresh > dirty)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1411) return 1UL << (ilog2(thresh - dirty) >> 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1412)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1413) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1414) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1415)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1416) static unsigned long wb_max_pause(struct bdi_writeback *wb,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1417) unsigned long wb_dirty)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1418) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1419) unsigned long bw = wb->avg_write_bandwidth;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1420) unsigned long t;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1421)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1422) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1423) * Limit pause time for small memory systems. If sleeping for too long
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1424) * time, a small pool of dirty/writeback pages may go empty and disk go
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1425) * idle.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1426) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1427) * 8 serves as the safety ratio.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1428) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1429) t = wb_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1430) t++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1431)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1432) return min_t(unsigned long, t, MAX_PAUSE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1433) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1434)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1435) static long wb_min_pause(struct bdi_writeback *wb,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1436) long max_pause,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1437) unsigned long task_ratelimit,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1438) unsigned long dirty_ratelimit,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1439) int *nr_dirtied_pause)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1440) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1441) long hi = ilog2(wb->avg_write_bandwidth);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1442) long lo = ilog2(wb->dirty_ratelimit);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1443) long t; /* target pause */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1444) long pause; /* estimated next pause */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1445) int pages; /* target nr_dirtied_pause */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1446)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1447) /* target for 10ms pause on 1-dd case */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1448) t = max(1, HZ / 100);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1449)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1450) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1451) * Scale up pause time for concurrent dirtiers in order to reduce CPU
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1452) * overheads.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1453) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1454) * (N * 10ms) on 2^N concurrent tasks.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1455) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1456) if (hi > lo)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1457) t += (hi - lo) * (10 * HZ) / 1024;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1458)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1459) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1460) * This is a bit convoluted. We try to base the next nr_dirtied_pause
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1461) * on the much more stable dirty_ratelimit. However the next pause time
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1462) * will be computed based on task_ratelimit and the two rate limits may
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1463) * depart considerably at some time. Especially if task_ratelimit goes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1464) * below dirty_ratelimit/2 and the target pause is max_pause, the next
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1465) * pause time will be max_pause*2 _trimmed down_ to max_pause. As a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1466) * result task_ratelimit won't be executed faithfully, which could
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1467) * eventually bring down dirty_ratelimit.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1468) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1469) * We apply two rules to fix it up:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1470) * 1) try to estimate the next pause time and if necessary, use a lower
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1471) * nr_dirtied_pause so as not to exceed max_pause. When this happens,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1472) * nr_dirtied_pause will be "dancing" with task_ratelimit.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1473) * 2) limit the target pause time to max_pause/2, so that the normal
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1474) * small fluctuations of task_ratelimit won't trigger rule (1) and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1475) * nr_dirtied_pause will remain as stable as dirty_ratelimit.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1476) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1477) t = min(t, 1 + max_pause / 2);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1478) pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1479)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1480) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1481) * Tiny nr_dirtied_pause is found to hurt I/O performance in the test
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1482) * case fio-mmap-randwrite-64k, which does 16*{sync read, async write}.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1483) * When the 16 consecutive reads are often interrupted by some dirty
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1484) * throttling pause during the async writes, cfq will go into idles
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1485) * (deadline is fine). So push nr_dirtied_pause as high as possible
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1486) * until reaches DIRTY_POLL_THRESH=32 pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1487) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1488) if (pages < DIRTY_POLL_THRESH) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1489) t = max_pause;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1490) pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1491) if (pages > DIRTY_POLL_THRESH) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1492) pages = DIRTY_POLL_THRESH;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1493) t = HZ * DIRTY_POLL_THRESH / dirty_ratelimit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1494) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1495) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1496)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1497) pause = HZ * pages / (task_ratelimit + 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1498) if (pause > max_pause) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1499) t = max_pause;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1500) pages = task_ratelimit * t / roundup_pow_of_two(HZ);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1501) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1502)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1503) *nr_dirtied_pause = pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1504) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1505) * The minimal pause time will normally be half the target pause time.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1506) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1507) return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1508) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1509)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1510) static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1511) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1512) struct bdi_writeback *wb = dtc->wb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1513) unsigned long wb_reclaimable;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1514)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1515) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1516) * wb_thresh is not treated as some limiting factor as
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1517) * dirty_thresh, due to reasons
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1518) * - in JBOD setup, wb_thresh can fluctuate a lot
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1519) * - in a system with HDD and USB key, the USB key may somehow
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1520) * go into state (wb_dirty >> wb_thresh) either because
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1521) * wb_dirty starts high, or because wb_thresh drops low.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1522) * In this case we don't want to hard throttle the USB key
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1523) * dirtiers for 100 seconds until wb_dirty drops under
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1524) * wb_thresh. Instead the auxiliary wb control line in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1525) * wb_position_ratio() will let the dirtier task progress
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1526) * at some rate <= (write_bw / 2) for bringing down wb_dirty.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1527) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1528) dtc->wb_thresh = __wb_calc_thresh(dtc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1529) dtc->wb_bg_thresh = dtc->thresh ?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1530) div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1531)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1532) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1533) * In order to avoid the stacked BDI deadlock we need
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1534) * to ensure we accurately count the 'dirty' pages when
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1535) * the threshold is low.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1536) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1537) * Otherwise it would be possible to get thresh+n pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1538) * reported dirty, even though there are thresh-m pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1539) * actually dirty; with m+n sitting in the percpu
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1540) * deltas.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1541) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1542) if (dtc->wb_thresh < 2 * wb_stat_error()) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1543) wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1544) dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1545) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1546) wb_reclaimable = wb_stat(wb, WB_RECLAIMABLE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1547) dtc->wb_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1548) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1549) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1550)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1551) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1552) * balance_dirty_pages() must be called by processes which are generating dirty
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1553) * data. It looks at the number of dirty pages in the machine and will force
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1554) * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1555) * If we're over `background_thresh' then the writeback threads are woken to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1556) * perform some writeout.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1557) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1558) static void balance_dirty_pages(struct bdi_writeback *wb,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1559) unsigned long pages_dirtied)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1560) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1561) struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1562) struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1563) struct dirty_throttle_control * const gdtc = &gdtc_stor;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1564) struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1565) &mdtc_stor : NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1566) struct dirty_throttle_control *sdtc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1567) unsigned long nr_reclaimable; /* = file_dirty */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1568) long period;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1569) long pause;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1570) long max_pause;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1571) long min_pause;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1572) int nr_dirtied_pause;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1573) bool dirty_exceeded = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1574) unsigned long task_ratelimit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1575) unsigned long dirty_ratelimit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1576) struct backing_dev_info *bdi = wb->bdi;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1577) bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1578) unsigned long start_time = jiffies;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1579)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1580) for (;;) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1581) unsigned long now = jiffies;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1582) unsigned long dirty, thresh, bg_thresh;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1583) unsigned long m_dirty = 0; /* stop bogus uninit warnings */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1584) unsigned long m_thresh = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1585) unsigned long m_bg_thresh = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1586)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1587) nr_reclaimable = global_node_page_state(NR_FILE_DIRTY);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1588) gdtc->avail = global_dirtyable_memory();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1589) gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1590)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1591) domain_dirty_limits(gdtc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1592)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1593) if (unlikely(strictlimit)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1594) wb_dirty_limits(gdtc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1595)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1596) dirty = gdtc->wb_dirty;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1597) thresh = gdtc->wb_thresh;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1598) bg_thresh = gdtc->wb_bg_thresh;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1599) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1600) dirty = gdtc->dirty;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1601) thresh = gdtc->thresh;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1602) bg_thresh = gdtc->bg_thresh;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1603) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1604)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1605) if (mdtc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1606) unsigned long filepages, headroom, writeback;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1607)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1608) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1609) * If @wb belongs to !root memcg, repeat the same
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1610) * basic calculations for the memcg domain.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1611) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1612) mem_cgroup_wb_stats(wb, &filepages, &headroom,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1613) &mdtc->dirty, &writeback);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1614) mdtc->dirty += writeback;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1615) mdtc_calc_avail(mdtc, filepages, headroom);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1616)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1617) domain_dirty_limits(mdtc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1618)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1619) if (unlikely(strictlimit)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1620) wb_dirty_limits(mdtc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1621) m_dirty = mdtc->wb_dirty;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1622) m_thresh = mdtc->wb_thresh;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1623) m_bg_thresh = mdtc->wb_bg_thresh;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1624) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1625) m_dirty = mdtc->dirty;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1626) m_thresh = mdtc->thresh;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1627) m_bg_thresh = mdtc->bg_thresh;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1628) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1629) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1630)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1631) trace_android_vh_mm_dirty_limits(gdtc, strictlimit, dirty, bg_thresh,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1632) nr_reclaimable, pages_dirtied);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1633)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1634) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1635) * Throttle it only when the background writeback cannot
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1636) * catch-up. This avoids (excessively) small writeouts
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1637) * when the wb limits are ramping up in case of !strictlimit.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1638) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1639) * In strictlimit case make decision based on the wb counters
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1640) * and limits. Small writeouts when the wb limits are ramping
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1641) * up are the price we consciously pay for strictlimit-ing.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1642) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1643) * If memcg domain is in effect, @dirty should be under
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1644) * both global and memcg freerun ceilings.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1645) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1646) if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1647) (!mdtc ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1648) m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1649) unsigned long intv;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1650) unsigned long m_intv;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1651)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1652) free_running:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1653) intv = dirty_poll_interval(dirty, thresh);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1654) m_intv = ULONG_MAX;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1655)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1656) current->dirty_paused_when = now;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1657) current->nr_dirtied = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1658) if (mdtc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1659) m_intv = dirty_poll_interval(m_dirty, m_thresh);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1660) current->nr_dirtied_pause = min(intv, m_intv);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1661) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1662) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1663)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1664) if (unlikely(!writeback_in_progress(wb)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1665) wb_start_background_writeback(wb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1666)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1667) mem_cgroup_flush_foreign(wb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1668)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1669) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1670) * Calculate global domain's pos_ratio and select the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1671) * global dtc by default.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1672) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1673) if (!strictlimit) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1674) wb_dirty_limits(gdtc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1675)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1676) if ((current->flags & PF_LOCAL_THROTTLE) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1677) gdtc->wb_dirty <
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1678) dirty_freerun_ceiling(gdtc->wb_thresh,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1679) gdtc->wb_bg_thresh))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1680) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1681) * LOCAL_THROTTLE tasks must not be throttled
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1682) * when below the per-wb freerun ceiling.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1683) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1684) goto free_running;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1685) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1686)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1687) dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1688) ((gdtc->dirty > gdtc->thresh) || strictlimit);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1689)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1690) wb_position_ratio(gdtc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1691) sdtc = gdtc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1692)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1693) if (mdtc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1694) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1695) * If memcg domain is in effect, calculate its
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1696) * pos_ratio. @wb should satisfy constraints from
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1697) * both global and memcg domains. Choose the one
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1698) * w/ lower pos_ratio.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1699) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1700) if (!strictlimit) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1701) wb_dirty_limits(mdtc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1702)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1703) if ((current->flags & PF_LOCAL_THROTTLE) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1704) mdtc->wb_dirty <
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1705) dirty_freerun_ceiling(mdtc->wb_thresh,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1706) mdtc->wb_bg_thresh))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1707) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1708) * LOCAL_THROTTLE tasks must not be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1709) * throttled when below the per-wb
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1710) * freerun ceiling.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1711) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1712) goto free_running;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1713) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1714) dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1715) ((mdtc->dirty > mdtc->thresh) || strictlimit);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1716)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1717) wb_position_ratio(mdtc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1718) if (mdtc->pos_ratio < gdtc->pos_ratio)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1719) sdtc = mdtc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1720) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1721)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1722) if (dirty_exceeded && !wb->dirty_exceeded)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1723) wb->dirty_exceeded = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1724)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1725) if (time_is_before_jiffies(wb->bw_time_stamp +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1726) BANDWIDTH_INTERVAL)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1727) spin_lock(&wb->list_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1728) __wb_update_bandwidth(gdtc, mdtc, start_time, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1729) spin_unlock(&wb->list_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1730) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1731)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1732) /* throttle according to the chosen dtc */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1733) dirty_ratelimit = wb->dirty_ratelimit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1734) task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1735) RATELIMIT_CALC_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1736) max_pause = wb_max_pause(wb, sdtc->wb_dirty);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1737) min_pause = wb_min_pause(wb, max_pause,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1738) task_ratelimit, dirty_ratelimit,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1739) &nr_dirtied_pause);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1740)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1741) if (unlikely(task_ratelimit == 0)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1742) period = max_pause;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1743) pause = max_pause;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1744) goto pause;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1745) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1746) period = HZ * pages_dirtied / task_ratelimit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1747) pause = period;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1748) if (current->dirty_paused_when)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1749) pause -= now - current->dirty_paused_when;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1750) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1751) * For less than 1s think time (ext3/4 may block the dirtier
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1752) * for up to 800ms from time to time on 1-HDD; so does xfs,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1753) * however at much less frequency), try to compensate it in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1754) * future periods by updating the virtual time; otherwise just
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1755) * do a reset, as it may be a light dirtier.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1756) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1757) if (pause < min_pause) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1758) trace_balance_dirty_pages(wb,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1759) sdtc->thresh,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1760) sdtc->bg_thresh,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1761) sdtc->dirty,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1762) sdtc->wb_thresh,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1763) sdtc->wb_dirty,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1764) dirty_ratelimit,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1765) task_ratelimit,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1766) pages_dirtied,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1767) period,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1768) min(pause, 0L),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1769) start_time);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1770) if (pause < -HZ) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1771) current->dirty_paused_when = now;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1772) current->nr_dirtied = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1773) } else if (period) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1774) current->dirty_paused_when += period;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1775) current->nr_dirtied = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1776) } else if (current->nr_dirtied_pause <= pages_dirtied)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1777) current->nr_dirtied_pause += pages_dirtied;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1778) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1779) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1780) if (unlikely(pause > max_pause)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1781) /* for occasional dropped task_ratelimit */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1782) now += min(pause - max_pause, max_pause);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1783) pause = max_pause;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1784) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1785)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1786) pause:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1787) trace_balance_dirty_pages(wb,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1788) sdtc->thresh,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1789) sdtc->bg_thresh,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1790) sdtc->dirty,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1791) sdtc->wb_thresh,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1792) sdtc->wb_dirty,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1793) dirty_ratelimit,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1794) task_ratelimit,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1795) pages_dirtied,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1796) period,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1797) pause,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1798) start_time);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1799) __set_current_state(TASK_KILLABLE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1800) wb->dirty_sleep = now;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1801) io_schedule_timeout(pause);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1802)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1803) current->dirty_paused_when = now + pause;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1804) current->nr_dirtied = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1805) current->nr_dirtied_pause = nr_dirtied_pause;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1806)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1807) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1808) * This is typically equal to (dirty < thresh) and can also
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1809) * keep "1000+ dd on a slow USB stick" under control.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1810) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1811) if (task_ratelimit)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1812) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1813)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1814) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1815) * In the case of an unresponding NFS server and the NFS dirty
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1816) * pages exceeds dirty_thresh, give the other good wb's a pipe
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1817) * to go through, so that tasks on them still remain responsive.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1818) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1819) * In theory 1 page is enough to keep the consumer-producer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1820) * pipe going: the flusher cleans 1 page => the task dirties 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1821) * more page. However wb_dirty has accounting errors. So use
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1822) * the larger and more IO friendly wb_stat_error.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1823) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1824) if (sdtc->wb_dirty <= wb_stat_error())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1825) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1826)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1827) if (fatal_signal_pending(current))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1828) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1829) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1830)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1831) if (!dirty_exceeded && wb->dirty_exceeded)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1832) wb->dirty_exceeded = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1833)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1834) if (writeback_in_progress(wb))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1835) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1836)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1837) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1838) * In laptop mode, we wait until hitting the higher threshold before
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1839) * starting background writeout, and then write out all the way down
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1840) * to the lower threshold. So slow writers cause minimal disk activity.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1841) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1842) * In normal mode, we start background writeout at the lower
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1843) * background_thresh, to keep the amount of dirty memory low.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1844) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1845) if (laptop_mode)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1846) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1847)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1848) if (nr_reclaimable > gdtc->bg_thresh)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1849) wb_start_background_writeback(wb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1850) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1851)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1852) static DEFINE_PER_CPU(int, bdp_ratelimits);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1853)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1854) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1855) * Normal tasks are throttled by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1856) * loop {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1857) * dirty tsk->nr_dirtied_pause pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1858) * take a snap in balance_dirty_pages();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1859) * }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1860) * However there is a worst case. If every task exit immediately when dirtied
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1861) * (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will never be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1862) * called to throttle the page dirties. The solution is to save the not yet
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1863) * throttled page dirties in dirty_throttle_leaks on task exit and charge them
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1864) * randomly into the running tasks. This works well for the above worst case,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1865) * as the new task will pick up and accumulate the old task's leaked dirty
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1866) * count and eventually get throttled.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1867) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1868) DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1869)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1870) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1871) * balance_dirty_pages_ratelimited - balance dirty memory state
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1872) * @mapping: address_space which was dirtied
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1873) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1874) * Processes which are dirtying memory should call in here once for each page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1875) * which was newly dirtied. The function will periodically check the system's
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1876) * dirty state and will initiate writeback if needed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1877) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1878) * On really big machines, get_writeback_state is expensive, so try to avoid
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1879) * calling it too often (ratelimiting). But once we're over the dirty memory
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1880) * limit we decrease the ratelimiting by a lot, to prevent individual processes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1881) * from overshooting the limit by (ratelimit_pages) each.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1882) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1883) void balance_dirty_pages_ratelimited(struct address_space *mapping)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1884) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1885) struct inode *inode = mapping->host;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1886) struct backing_dev_info *bdi = inode_to_bdi(inode);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1887) struct bdi_writeback *wb = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1888) int ratelimit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1889) int *p;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1890)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1891) if (!(bdi->capabilities & BDI_CAP_WRITEBACK))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1892) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1893)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1894) if (inode_cgwb_enabled(inode))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1895) wb = wb_get_create_current(bdi, GFP_KERNEL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1896) if (!wb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1897) wb = &bdi->wb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1898)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1899) ratelimit = current->nr_dirtied_pause;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1900) if (wb->dirty_exceeded)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1901) ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1902)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1903) preempt_disable();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1904) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1905) * This prevents one CPU to accumulate too many dirtied pages without
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1906) * calling into balance_dirty_pages(), which can happen when there are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1907) * 1000+ tasks, all of them start dirtying pages at exactly the same
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1908) * time, hence all honoured too large initial task->nr_dirtied_pause.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1909) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1910) p = this_cpu_ptr(&bdp_ratelimits);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1911) if (unlikely(current->nr_dirtied >= ratelimit))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1912) *p = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1913) else if (unlikely(*p >= ratelimit_pages)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1914) *p = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1915) ratelimit = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1916) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1917) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1918) * Pick up the dirtied pages by the exited tasks. This avoids lots of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1919) * short-lived tasks (eg. gcc invocations in a kernel build) escaping
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1920) * the dirty throttling and livelock other long-run dirtiers.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1921) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1922) p = this_cpu_ptr(&dirty_throttle_leaks);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1923) if (*p > 0 && current->nr_dirtied < ratelimit) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1924) unsigned long nr_pages_dirtied;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1925) nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1926) *p -= nr_pages_dirtied;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1927) current->nr_dirtied += nr_pages_dirtied;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1928) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1929) preempt_enable();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1930)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1931) if (unlikely(current->nr_dirtied >= ratelimit))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1932) balance_dirty_pages(wb, current->nr_dirtied);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1933)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1934) wb_put(wb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1935) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1936) EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1937)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1938) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1939) * wb_over_bg_thresh - does @wb need to be written back?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1940) * @wb: bdi_writeback of interest
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1941) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1942) * Determines whether background writeback should keep writing @wb or it's
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1943) * clean enough.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1944) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1945) * Return: %true if writeback should continue.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1946) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1947) bool wb_over_bg_thresh(struct bdi_writeback *wb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1948) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1949) struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1950) struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1951) struct dirty_throttle_control * const gdtc = &gdtc_stor;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1952) struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1953) &mdtc_stor : NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1954)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1955) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1956) * Similar to balance_dirty_pages() but ignores pages being written
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1957) * as we're trying to decide whether to put more under writeback.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1958) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1959) gdtc->avail = global_dirtyable_memory();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1960) gdtc->dirty = global_node_page_state(NR_FILE_DIRTY);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1961) domain_dirty_limits(gdtc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1962)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1963) if (gdtc->dirty > gdtc->bg_thresh)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1964) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1965)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1966) if (wb_stat(wb, WB_RECLAIMABLE) >
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1967) wb_calc_thresh(gdtc->wb, gdtc->bg_thresh))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1968) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1969)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1970) if (mdtc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1971) unsigned long filepages, headroom, writeback;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1972)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1973) mem_cgroup_wb_stats(wb, &filepages, &headroom, &mdtc->dirty,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1974) &writeback);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1975) mdtc_calc_avail(mdtc, filepages, headroom);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1976) domain_dirty_limits(mdtc); /* ditto, ignore writeback */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1977)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1978) if (mdtc->dirty > mdtc->bg_thresh)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1979) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1980)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1981) if (wb_stat(wb, WB_RECLAIMABLE) >
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1982) wb_calc_thresh(mdtc->wb, mdtc->bg_thresh))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1983) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1984) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1985)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1986) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1987) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1988)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1989) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1990) * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1991) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1992) int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1993) void *buffer, size_t *length, loff_t *ppos)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1994) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1995) unsigned int old_interval = dirty_writeback_interval;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1996) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1997)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1998) ret = proc_dointvec(table, write, buffer, length, ppos);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1999)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2000) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2001) * Writing 0 to dirty_writeback_interval will disable periodic writeback
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2002) * and a different non-zero value will wakeup the writeback threads.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2003) * wb_wakeup_delayed() would be more appropriate, but it's a pain to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2004) * iterate over all bdis and wbs.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2005) * The reason we do this is to make the change take effect immediately.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2006) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2007) if (!ret && write && dirty_writeback_interval &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2008) dirty_writeback_interval != old_interval)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2009) wakeup_flusher_threads(WB_REASON_PERIODIC);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2010)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2011) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2012) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2013)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2014) #ifdef CONFIG_BLOCK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2015) void laptop_mode_timer_fn(struct timer_list *t)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2016) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2017) struct backing_dev_info *backing_dev_info =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2018) from_timer(backing_dev_info, t, laptop_mode_wb_timer);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2019)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2020) wakeup_flusher_threads_bdi(backing_dev_info, WB_REASON_LAPTOP_TIMER);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2021) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2022)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2023) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2024) * We've spun up the disk and we're in laptop mode: schedule writeback
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2025) * of all dirty data a few seconds from now. If the flush is already scheduled
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2026) * then push it back - the user is still using the disk.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2027) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2028) void laptop_io_completion(struct backing_dev_info *info)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2029) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2030) mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2031) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2032)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2033) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2034) * We're in laptop mode and we've just synced. The sync's writes will have
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2035) * caused another writeback to be scheduled by laptop_io_completion.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2036) * Nothing needs to be written back anymore, so we unschedule the writeback.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2037) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2038) void laptop_sync_completion(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2039) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2040) struct backing_dev_info *bdi;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2041)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2042) rcu_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2043)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2044) list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2045) del_timer(&bdi->laptop_mode_wb_timer);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2046)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2047) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2048) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2049) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2050)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2051) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2052) * If ratelimit_pages is too high then we can get into dirty-data overload
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2053) * if a large number of processes all perform writes at the same time.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2054) * If it is too low then SMP machines will call the (expensive)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2055) * get_writeback_state too often.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2056) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2057) * Here we set ratelimit_pages to a level which ensures that when all CPUs are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2058) * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2059) * thresholds.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2060) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2061)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2062) void writeback_set_ratelimit(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2063) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2064) struct wb_domain *dom = &global_wb_domain;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2065) unsigned long background_thresh;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2066) unsigned long dirty_thresh;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2067)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2068) global_dirty_limits(&background_thresh, &dirty_thresh);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2069) dom->dirty_limit = dirty_thresh;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2070) ratelimit_pages = dirty_thresh / (num_online_cpus() * 32);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2071) if (ratelimit_pages < 16)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2072) ratelimit_pages = 16;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2073) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2074)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2075) static int page_writeback_cpu_online(unsigned int cpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2076) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2077) writeback_set_ratelimit();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2078) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2079) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2080)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2081) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2082) * Called early on to tune the page writeback dirty limits.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2083) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2084) * We used to scale dirty pages according to how total memory
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2085) * related to pages that could be allocated for buffers.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2086) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2087) * However, that was when we used "dirty_ratio" to scale with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2088) * all memory, and we don't do that any more. "dirty_ratio"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2089) * is now applied to total non-HIGHPAGE memory, and as such we can't
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2090) * get into the old insane situation any more where we had
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2091) * large amounts of dirty pages compared to a small amount of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2092) * non-HIGHMEM memory.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2093) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2094) * But we might still want to scale the dirty_ratio by how
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2095) * much memory the box has..
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2096) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2097) void __init page_writeback_init(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2098) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2099) BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2100)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2101) cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mm/writeback:online",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2102) page_writeback_cpu_online, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2103) cpuhp_setup_state(CPUHP_MM_WRITEBACK_DEAD, "mm/writeback:dead", NULL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2104) page_writeback_cpu_online);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2105) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2106)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2107) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2108) * tag_pages_for_writeback - tag pages to be written by write_cache_pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2109) * @mapping: address space structure to write
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2110) * @start: starting page index
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2111) * @end: ending page index (inclusive)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2112) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2113) * This function scans the page range from @start to @end (inclusive) and tags
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2114) * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2115) * that write_cache_pages (or whoever calls this function) will then use
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2116) * TOWRITE tag to identify pages eligible for writeback. This mechanism is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2117) * used to avoid livelocking of writeback by a process steadily creating new
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2118) * dirty pages in the file (thus it is important for this function to be quick
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2119) * so that it can tag pages faster than a dirtying process can create them).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2120) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2121) void tag_pages_for_writeback(struct address_space *mapping,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2122) pgoff_t start, pgoff_t end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2123) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2124) XA_STATE(xas, &mapping->i_pages, start);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2125) unsigned int tagged = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2126) void *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2127)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2128) xas_lock_irq(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2129) xas_for_each_marked(&xas, page, end, PAGECACHE_TAG_DIRTY) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2130) xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2131) if (++tagged % XA_CHECK_SCHED)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2132) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2133)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2134) xas_pause(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2135) xas_unlock_irq(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2136) cond_resched();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2137) xas_lock_irq(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2138) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2139) xas_unlock_irq(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2140) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2141) EXPORT_SYMBOL(tag_pages_for_writeback);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2142)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2143) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2144) * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2145) * @mapping: address space structure to write
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2146) * @wbc: subtract the number of written pages from *@wbc->nr_to_write
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2147) * @writepage: function called for each page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2148) * @data: data passed to writepage function
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2149) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2150) * If a page is already under I/O, write_cache_pages() skips it, even
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2151) * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2152) * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2153) * and msync() need to guarantee that all the data which was dirty at the time
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2154) * the call was made get new I/O started against them. If wbc->sync_mode is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2155) * WB_SYNC_ALL then we were called for data integrity and we must wait for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2156) * existing IO to complete.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2157) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2158) * To avoid livelocks (when other process dirties new pages), we first tag
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2159) * pages which should be written back with TOWRITE tag and only then start
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2160) * writing them. For data-integrity sync we have to be careful so that we do
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2161) * not miss some pages (e.g., because some other process has cleared TOWRITE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2162) * tag we set). The rule we follow is that TOWRITE tag can be cleared only
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2163) * by the process clearing the DIRTY tag (and submitting the page for IO).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2164) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2165) * To avoid deadlocks between range_cyclic writeback and callers that hold
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2166) * pages in PageWriteback to aggregate IO until write_cache_pages() returns,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2167) * we do not loop back to the start of the file. Doing so causes a page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2168) * lock/page writeback access order inversion - we should only ever lock
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2169) * multiple pages in ascending page->index order, and looping back to the start
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2170) * of the file violates that rule and causes deadlocks.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2171) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2172) * Return: %0 on success, negative error code otherwise
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2173) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2174) int write_cache_pages(struct address_space *mapping,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2175) struct writeback_control *wbc, writepage_t writepage,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2176) void *data)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2177) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2178) int ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2179) int done = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2180) int error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2181) struct pagevec pvec;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2182) int nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2183) pgoff_t index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2184) pgoff_t end; /* Inclusive */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2185) pgoff_t done_index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2186) int range_whole = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2187) xa_mark_t tag;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2188)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2189) pagevec_init(&pvec);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2190) if (wbc->range_cyclic) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2191) index = mapping->writeback_index; /* prev offset */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2192) end = -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2193) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2194) index = wbc->range_start >> PAGE_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2195) end = wbc->range_end >> PAGE_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2196) if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2197) range_whole = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2198) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2199) if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2200) tag_pages_for_writeback(mapping, index, end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2201) tag = PAGECACHE_TAG_TOWRITE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2202) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2203) tag = PAGECACHE_TAG_DIRTY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2204) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2205) done_index = index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2206) while (!done && (index <= end)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2207) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2208)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2209) nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2210) tag);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2211) if (nr_pages == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2212) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2213)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2214) for (i = 0; i < nr_pages; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2215) struct page *page = pvec.pages[i];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2216)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2217) done_index = page->index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2218)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2219) lock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2220)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2221) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2222) * Page truncated or invalidated. We can freely skip it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2223) * then, even for data integrity operations: the page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2224) * has disappeared concurrently, so there could be no
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2225) * real expectation of this data interity operation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2226) * even if there is now a new, dirty page at the same
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2227) * pagecache address.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2228) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2229) if (unlikely(page->mapping != mapping)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2230) continue_unlock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2231) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2232) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2233) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2234)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2235) if (!PageDirty(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2236) /* someone wrote it for us */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2237) goto continue_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2238) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2239)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2240) if (PageWriteback(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2241) if (wbc->sync_mode != WB_SYNC_NONE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2242) wait_on_page_writeback(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2243) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2244) goto continue_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2245) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2246)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2247) BUG_ON(PageWriteback(page));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2248) if (!clear_page_dirty_for_io(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2249) goto continue_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2250)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2251) trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2252) error = (*writepage)(page, wbc, data);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2253) if (unlikely(error)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2254) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2255) * Handle errors according to the type of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2256) * writeback. There's no need to continue for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2257) * background writeback. Just push done_index
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2258) * past this page so media errors won't choke
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2259) * writeout for the entire file. For integrity
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2260) * writeback, we must process the entire dirty
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2261) * set regardless of errors because the fs may
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2262) * still have state to clear for each page. In
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2263) * that case we continue processing and return
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2264) * the first error.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2265) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2266) if (error == AOP_WRITEPAGE_ACTIVATE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2267) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2268) error = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2269) } else if (wbc->sync_mode != WB_SYNC_ALL) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2270) ret = error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2271) done_index = page->index + 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2272) done = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2273) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2274) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2275) if (!ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2276) ret = error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2277) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2278)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2279) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2280) * We stop writing back only if we are not doing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2281) * integrity sync. In case of integrity sync we have to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2282) * keep going until we have written all the pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2283) * we tagged for writeback prior to entering this loop.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2284) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2285) if (--wbc->nr_to_write <= 0 &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2286) wbc->sync_mode == WB_SYNC_NONE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2287) done = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2288) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2289) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2290) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2291) pagevec_release(&pvec);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2292) cond_resched();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2293) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2294)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2295) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2296) * If we hit the last page and there is more work to be done: wrap
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2297) * the index back to the start of the file for the next
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2298) * time we are called.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2299) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2300) if (wbc->range_cyclic && !done)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2301) done_index = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2302) if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2303) mapping->writeback_index = done_index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2304)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2305) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2306) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2307) EXPORT_SYMBOL(write_cache_pages);
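
/*
 * Illustrative sketch, not taken from any in-tree filesystem: a minimal
 * ->writepages() method for a hypothetical "myfs" can be built directly on
 * write_cache_pages() by supplying its own writepage callback.  Both
 * myfs_writepages() and myfs_writepage() are invented names here;
 * myfs_writepage() is assumed to have the writepage_t signature
 * (struct page *, struct writeback_control *, void *data).
 *
 *	static int myfs_writepages(struct address_space *mapping,
 *				   struct writeback_control *wbc)
 *	{
 *		return write_cache_pages(mapping, wbc, myfs_writepage, mapping);
 *	}
 */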
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2308)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2309) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2310) * Function used by generic_writepages() to call the real writepage
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2311) * function and set the mapping flags on error.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2312) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2313) static int __writepage(struct page *page, struct writeback_control *wbc,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2314) void *data)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2315) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2316) struct address_space *mapping = data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2317) int ret = mapping->a_ops->writepage(page, wbc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2318) mapping_set_error(mapping, ret);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2319) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2320) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2321)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2322) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2323) * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2324) * @mapping: address space structure to write
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2325) * @wbc: subtract the number of written pages from *@wbc->nr_to_write
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2326) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2327) * This is a library function, which implements the writepages()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2328) * address_space_operation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2329) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2330) * Return: %0 on success, negative error code otherwise
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2331) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2332) int generic_writepages(struct address_space *mapping,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2333) struct writeback_control *wbc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2334) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2335) struct blk_plug plug;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2336) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2337)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2338) /* deal with chardevs and other special files */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2339) if (!mapping->a_ops->writepage)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2340) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2341)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2342) blk_start_plug(&plug);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2343) ret = write_cache_pages(mapping, wbc, __writepage, mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2344) blk_finish_plug(&plug);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2345) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2346) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2347)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2348) EXPORT_SYMBOL(generic_writepages);
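
/*
 * Sketch only: a filesystem with no special batching needs can leave
 * ->writepages NULL, in which case do_writepages() below falls back to
 * generic_writepages(), or it can call this helper from its own method
 * after per-fs setup.  "simplefs" is a hypothetical example:
 *
 *	static int simplefs_writepages(struct address_space *mapping,
 *				       struct writeback_control *wbc)
 *	{
 *		return generic_writepages(mapping, wbc);
 *	}
 */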
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2349)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2350) int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2351) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2352) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2353)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2354) if (wbc->nr_to_write <= 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2355) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2356) while (1) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2357) if (mapping->a_ops->writepages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2358) ret = mapping->a_ops->writepages(mapping, wbc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2359) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2360) ret = generic_writepages(mapping, wbc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2361) if ((ret != -ENOMEM) || (wbc->sync_mode != WB_SYNC_ALL))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2362) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2363) cond_resched();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2364) congestion_wait(BLK_RW_ASYNC, HZ/50);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2365) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2366) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2367) }
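
/*
 * Illustrative sketch of a caller: a data-integrity flush of a whole mapping
 * fills in a writeback_control and hands it to do_writepages(), roughly the
 * way the filemap write-and-wait helpers do.  "mapping" and "err" are
 * assumed to be in scope:
 *
 *	struct writeback_control wbc = {
 *		.sync_mode	= WB_SYNC_ALL,
 *		.nr_to_write	= LONG_MAX,
 *		.range_start	= 0,
 *		.range_end	= LLONG_MAX,
 *	};
 *
 *	err = do_writepages(mapping, &wbc);
 */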
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2368)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2369) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2370) * write_one_page - write out a single page and wait on I/O
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2371) * @page: the page to write
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2372) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2373) * The page must be locked by the caller and will be unlocked upon return.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2374) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2375) * Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2376) * function returns.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2377) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2378) * Return: %0 on success, negative error code otherwise
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2379) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2380) int write_one_page(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2381) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2382) struct address_space *mapping = page->mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2383) int ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2384) struct writeback_control wbc = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2385) .sync_mode = WB_SYNC_ALL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2386) .nr_to_write = 1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2387) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2388)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2389) BUG_ON(!PageLocked(page));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2390)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2391) wait_on_page_writeback(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2392)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2393) if (clear_page_dirty_for_io(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2394) get_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2395) ret = mapping->a_ops->writepage(page, &wbc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2396) if (ret == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2397) wait_on_page_writeback(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2398) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2399) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2400) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2401) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2402)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2403) if (!ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2404) ret = filemap_check_errors(mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2405) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2406) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2407) EXPORT_SYMBOL(write_one_page);
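
/*
 * Usage sketch ("page" and "err" assumed in scope): the caller locks the
 * page, and write_one_page() unlocks it whether or not I/O was started:
 *
 *	lock_page(page);
 *	err = write_one_page(page);
 *
 * On return the page is unlocked, and err also reflects any previously
 * recorded AS_EIO/AS_ENOSPC error on the mapping.
 */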
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2408)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2409) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2410) * For address_spaces which neither use buffers nor write back.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2411) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2412) int __set_page_dirty_no_writeback(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2413) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2414) if (!PageDirty(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2415) return !TestSetPageDirty(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2416) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2417) }
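
/*
 * Illustrative sketch: RAM-backed filesystems (ramfs-like; "myramfs" is a
 * made-up name) wire this up as their ->set_page_dirty so that dirtying
 * costs nothing beyond setting the page flag:
 *
 *	static const struct address_space_operations myramfs_aops = {
 *		.set_page_dirty	= __set_page_dirty_no_writeback,
 *	};
 */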
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2418)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2419) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2420) * Helper function for set_page_dirty family.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2421) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2422) * Caller must hold lock_page_memcg().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2423) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2424) * NOTE: This relies on being atomic wrt interrupts.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2425) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2426) void account_page_dirtied(struct page *page, struct address_space *mapping)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2427) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2428) struct inode *inode = mapping->host;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2429)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2430) trace_writeback_dirty_page(page, mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2431)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2432) if (mapping_can_writeback(mapping)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2433) struct bdi_writeback *wb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2434)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2435) inode_attach_wb(inode, page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2436) wb = inode_to_wb(inode);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2437)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2438) __inc_lruvec_page_state(page, NR_FILE_DIRTY);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2439) __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2440) __inc_node_page_state(page, NR_DIRTIED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2441) inc_wb_stat(wb, WB_RECLAIMABLE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2442) inc_wb_stat(wb, WB_DIRTIED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2443) task_io_account_write(PAGE_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2444) current->nr_dirtied++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2445) this_cpu_inc(bdp_ratelimits);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2446)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2447) mem_cgroup_track_foreign_dirty(page, wb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2448) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2449) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2450)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2451) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2452) * Helper function for de-accounting a dirty page without writeback.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2453) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2454) * Caller must hold lock_page_memcg().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2455) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2456) void account_page_cleaned(struct page *page, struct address_space *mapping,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2457) struct bdi_writeback *wb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2458) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2459) if (mapping_can_writeback(mapping)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2460) dec_lruvec_page_state(page, NR_FILE_DIRTY);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2461) dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2462) dec_wb_stat(wb, WB_RECLAIMABLE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2463) task_io_account_cancelled_write(PAGE_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2464) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2465) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2466)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2467) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2468) * For address_spaces which do not use buffers. Just tag the page as dirty in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2469) * the xarray.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2470) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2471) * This is also used when a single buffer is being dirtied: we want to set the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2472) * page dirty in that case, but not all the buffers. This is a "bottom-up"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2473) * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2474) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2475) * The caller must ensure this doesn't race with truncation. Most will simply
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2476) * hold the page lock, but e.g. zap_pte_range() calls with the page mapped and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2477) * the pte lock held, which also locks out truncation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2478) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2479) int __set_page_dirty_nobuffers(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2480) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2481) lock_page_memcg(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2482) if (!TestSetPageDirty(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2483) struct address_space *mapping = page_mapping(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2484) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2485)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2486) if (!mapping) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2487) unlock_page_memcg(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2488) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2489) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2490)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2491) xa_lock_irqsave(&mapping->i_pages, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2492) BUG_ON(page_mapping(page) != mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2493) WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2494) account_page_dirtied(page, mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2495) __xa_set_mark(&mapping->i_pages, page_index(page),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2496) PAGECACHE_TAG_DIRTY);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2497) xa_unlock_irqrestore(&mapping->i_pages, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2498) unlock_page_memcg(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2499)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2500) if (mapping->host) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2501) /* !PageAnon && !swapper_space */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2502) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2503) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2504) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2505) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2506) unlock_page_memcg(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2507) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2508) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2509) EXPORT_SYMBOL(__set_page_dirty_nobuffers);
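
/*
 * Sketch, with "myfs" and its methods hypothetical: filesystems that do not
 * attach buffer_heads typically point ->set_page_dirty straight at this
 * helper:
 *
 *	static const struct address_space_operations myfs_aops = {
 *		.set_page_dirty	= __set_page_dirty_nobuffers,
 *		.writepage	= myfs_writepage,
 *		.writepages	= myfs_writepages,
 *	};
 */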
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2510)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2511) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2512) * Call this whenever redirtying a page, to de-account the dirty counters
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2513) * (NR_DIRTIED, WB_DIRTIED, tsk->nr_dirtied), so that they match the written
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2514) * counters (NR_WRITTEN, WB_WRITTEN) in the long term. The mismatches will lead to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2515) * systematic errors in balanced_dirty_ratelimit and the dirty pages position
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2516) * control.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2517) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2518) void account_page_redirty(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2519) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2520) struct address_space *mapping = page->mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2521)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2522) if (mapping && mapping_can_writeback(mapping)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2523) struct inode *inode = mapping->host;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2524) struct bdi_writeback *wb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2525) struct wb_lock_cookie cookie = {};
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2526)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2527) wb = unlocked_inode_to_wb_begin(inode, &cookie);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2528) current->nr_dirtied--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2529) dec_node_page_state(page, NR_DIRTIED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2530) dec_wb_stat(wb, WB_DIRTIED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2531) unlocked_inode_to_wb_end(inode, &cookie);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2532) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2533) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2534) EXPORT_SYMBOL(account_page_redirty);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2535)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2536) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2537) * When a writepage implementation decides that it doesn't want to write this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2538) * page for some reason, it should redirty the locked page via
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2539) * redirty_page_for_writepage(), then unlock the page and return 0.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2540) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2541) int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2542) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2543) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2544)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2545) wbc->pages_skipped++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2546) ret = __set_page_dirty_nobuffers(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2547) account_page_redirty(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2548) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2549) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2550) EXPORT_SYMBOL(redirty_page_for_writepage);
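
/*
 * Illustrative fragment from a hypothetical ->writepage; myfs_can_write_now()
 * and myfs_do_writepage() are invented for the example.  When the page cannot
 * be written right now, it is handed back rather than failed:
 *
 *	static int myfs_writepage(struct page *page, struct writeback_control *wbc)
 *	{
 *		if (!myfs_can_write_now(page->mapping->host)) {
 *			redirty_page_for_writepage(wbc, page);
 *			unlock_page(page);
 *			return 0;
 *		}
 *		return myfs_do_writepage(page, wbc);
 *	}
 */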
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2551)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2552) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2553) * Dirty a page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2554) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2555) * For pages with a mapping this should be done under the page lock
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2556) * for the benefit of asynchronous memory errors which prefer a consistent
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2557) * dirty state. This rule can be broken in some special cases,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2558) * but it is better not to.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2559) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2560) * If the mapping doesn't provide a set_page_dirty a_op, then
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2561) * just fall through and assume that it wants buffer_heads.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2562) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2563) int set_page_dirty(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2564) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2565) struct address_space *mapping = page_mapping(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2566)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2567) page = compound_head(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2568) if (likely(mapping)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2569) int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2570) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2571) * readahead/lru_deactivate_page could leave
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2572) * PG_readahead/PG_reclaim set due to a race with end_page_writeback().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2573) * For readahead, if the page is written, the flags will be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2574) * reset, so there is no problem.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2575) * For lru_deactivate_page, if the page is redirtied, the flag
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2576) * will be reset, so there is no problem. But if the page is used by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2577) * readahead it will confuse readahead and make it restart the size
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2578) * ramp-up process. That is only a minor problem, though.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2579) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2580) if (PageReclaim(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2581) ClearPageReclaim(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2582) #ifdef CONFIG_BLOCK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2583) if (!spd)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2584) spd = __set_page_dirty_buffers;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2585) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2586) return (*spd)(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2587) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2588) if (!PageDirty(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2589) if (!TestSetPageDirty(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2590) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2591) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2592) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2593) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2594) EXPORT_SYMBOL(set_page_dirty);
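
/*
 * Typical caller sketch ("page", "addr", "offset", "src" and "len" are
 * caller-supplied): modify the page contents under the page lock, then mark
 * the page dirty:
 *
 *	lock_page(page);
 *	addr = kmap_atomic(page);
 *	memcpy(addr + offset, src, len);
 *	kunmap_atomic(addr);
 *	set_page_dirty(page);
 *	unlock_page(page);
 */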
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2595)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2596) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2597) * set_page_dirty() is racy if the caller has no reference against
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2598) * page->mapping->host, and if the page is unlocked. This is because another
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2599) * CPU could truncate the page off the mapping and then free the mapping.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2600) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2601) * Usually, the page _is_ locked, or the caller is a user-space process which
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2602) * holds a reference on the inode by having an open file.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2603) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2604) * In other cases, the page should be locked before running set_page_dirty().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2605) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2606) int set_page_dirty_lock(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2607) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2608) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2609)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2610) lock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2611) ret = set_page_dirty(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2612) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2613) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2614) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2615) EXPORT_SYMBOL(set_page_dirty_lock);
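
/*
 * Typical user sketch: a driver that let hardware DMA into user pages pinned
 * earlier with get_user_pages() dirties and releases them once the transfer
 * completes ("pages" and "npages" come from that earlier call):
 *
 *	for (i = 0; i < npages; i++) {
 *		set_page_dirty_lock(pages[i]);
 *		put_page(pages[i]);
 *	}
 */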
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2616)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2617) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2618) * This cancels just the dirty bit on the kernel page itself, it does NOT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2619) * actually remove dirty bits on any mmap's that may be around. It also
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2620) * leaves the page tagged dirty, so any sync activity will still find it on
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2621) * the dirty lists, and in particular, clear_page_dirty_for_io() will still
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2622) * look at the dirty bits in the VM.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2623) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2624) * This should *normally* only ever be done when a page is truncated,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2625) * and is not actually mapped anywhere at all. However, fs/buffer.c does
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2626) * this when it notices that somebody has cleaned out all the buffers on a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2627) * page without actually doing it through the VM. Can you say "ext3 is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2628) * horribly ugly"? Thought you could.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2629) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2630) void __cancel_dirty_page(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2631) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2632) struct address_space *mapping = page_mapping(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2633)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2634) if (mapping_can_writeback(mapping)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2635) struct inode *inode = mapping->host;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2636) struct bdi_writeback *wb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2637) struct wb_lock_cookie cookie = {};
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2638)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2639) lock_page_memcg(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2640) wb = unlocked_inode_to_wb_begin(inode, &cookie);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2641)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2642) if (TestClearPageDirty(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2643) account_page_cleaned(page, mapping, wb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2644)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2645) unlocked_inode_to_wb_end(inode, &cookie);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2646) unlock_page_memcg(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2647) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2648) ClearPageDirty(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2649) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2650) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2651) EXPORT_SYMBOL(__cancel_dirty_page);
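
/*
 * Callers normally go through the cancel_dirty_page() wrapper in
 * <linux/mm.h>, which is roughly:
 *
 *	if (PageDirty(page))
 *		__cancel_dirty_page(page);
 *
 * so the common clean-page case avoids the locking and accounting above.
 */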
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2652)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2653) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2654) * Clear a page's dirty flag, while caring for dirty memory accounting.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2655) * Returns true if the page was previously dirty.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2656) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2657) * This is for preparing to put the page under writeout. We leave the page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2658) * tagged as dirty in the xarray so that a concurrent write-for-sync
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2659) * can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2660) * implementation will run either set_page_writeback() or set_page_dirty(),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2661) * at which stage we bring the page's dirty flag and xarray dirty tag
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2662) * back into sync.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2663) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2664) * This incoherency between the page's dirty flag and xarray tag is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2665) * unfortunate, but it only exists while the page is locked.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2666) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2667) int clear_page_dirty_for_io(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2668) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2669) struct address_space *mapping = page_mapping(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2670) int ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2671)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2672) VM_BUG_ON_PAGE(!PageLocked(page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2673)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2674) if (mapping && mapping_can_writeback(mapping)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2675) struct inode *inode = mapping->host;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2676) struct bdi_writeback *wb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2677) struct wb_lock_cookie cookie = {};
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2678)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2679) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2680) * Yes, Virginia, this is indeed insane.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2681) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2682) * We use this sequence to make sure that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2683) * (a) we account for dirty stats properly
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2684) * (b) we tell the low-level filesystem to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2685) * mark the whole page dirty if it was
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2686) * dirty in a pagetable. Only to then
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2687) * (c) clean the page again and return 1 to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2688) * cause the writeback.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2689) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2690) * This way we avoid all nasty races with the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2691) * dirty bit in multiple places and clearing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2692) * them concurrently from different threads.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2693) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2694) * Note! Normally the "set_page_dirty(page)"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2695) * has no effect on the actual dirty bit - since
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2696) * that will already usually be set. But we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2697) * need the side effects, and it can help us
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2698) * avoid races.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2699) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2700) * We basically use the page "master dirty bit"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2701) * as a serialization point for all the different
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2702) * threads doing their things.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2703) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2704) if (page_mkclean(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2705) set_page_dirty(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2706) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2707) * We carefully synchronise fault handlers against
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2708) * installing a dirty pte and marking the page dirty
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2709) * at this point. We do this by having them hold the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2710) * page lock while dirtying the page, and pages are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2711) * always locked coming in here, so we get the desired
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2712) * exclusion.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2713) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2714) wb = unlocked_inode_to_wb_begin(inode, &cookie);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2715) if (TestClearPageDirty(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2716) dec_lruvec_page_state(page, NR_FILE_DIRTY);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2717) dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2718) dec_wb_stat(wb, WB_RECLAIMABLE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2719) ret = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2720) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2721) unlocked_inode_to_wb_end(inode, &cookie);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2722) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2723) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2724) return TestClearPageDirty(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2725) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2726) EXPORT_SYMBOL(clear_page_dirty_for_io);
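
/*
 * Illustrative writeout sketch outside of ->writepage; myfs_submit_write()
 * is an invented I/O submission helper.  Clear the dirty bit, mark the page
 * under writeback, then submit; end_page_writeback() is later called from
 * the I/O completion path:
 *
 *	lock_page(page);
 *	if (clear_page_dirty_for_io(page)) {
 *		set_page_writeback(page);
 *		unlock_page(page);
 *		myfs_submit_write(page);
 *	} else {
 *		unlock_page(page);
 *	}
 */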
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2727)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2728) int test_clear_page_writeback(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2729) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2730) struct address_space *mapping = page_mapping(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2731) struct mem_cgroup *memcg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2732) struct lruvec *lruvec;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2733) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2734)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2735) memcg = lock_page_memcg(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2736) lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2737) if (mapping && mapping_use_writeback_tags(mapping)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2738) struct inode *inode = mapping->host;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2739) struct backing_dev_info *bdi = inode_to_bdi(inode);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2740) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2741)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2742) xa_lock_irqsave(&mapping->i_pages, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2743) ret = TestClearPageWriteback(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2744) if (ret) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2745) __xa_clear_mark(&mapping->i_pages, page_index(page),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2746) PAGECACHE_TAG_WRITEBACK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2747) if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2748) struct bdi_writeback *wb = inode_to_wb(inode);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2749)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2750) dec_wb_stat(wb, WB_WRITEBACK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2751) __wb_writeout_inc(wb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2752) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2753) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2754)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2755) if (mapping->host && !mapping_tagged(mapping,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2756) PAGECACHE_TAG_WRITEBACK))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2757) sb_clear_inode_writeback(mapping->host);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2758)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2759) xa_unlock_irqrestore(&mapping->i_pages, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2760) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2761) ret = TestClearPageWriteback(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2762) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2763) if (ret) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2764) dec_lruvec_state(lruvec, NR_WRITEBACK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2765) dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2766) inc_node_page_state(page, NR_WRITTEN);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2767) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2768) __unlock_page_memcg(memcg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2769) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2770) }
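
/*
 * This is normally reached via end_page_writeback() from an I/O completion
 * path rather than called directly.  A typical write-bio ->bi_end_io
 * fragment ("bio" is the completed bio) ends writeback on each page:
 *
 *	struct bio_vec *bvec;
 *	struct bvec_iter_all iter_all;
 *
 *	bio_for_each_segment_all(bvec, bio, iter_all)
 *		end_page_writeback(bvec->bv_page);
 */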
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2771)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2772) int __test_set_page_writeback(struct page *page, bool keep_write)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2773) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2774) struct address_space *mapping = page_mapping(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2775) int ret, access_ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2776)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2777) lock_page_memcg(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2778) if (mapping && mapping_use_writeback_tags(mapping)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2779) XA_STATE(xas, &mapping->i_pages, page_index(page));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2780) struct inode *inode = mapping->host;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2781) struct backing_dev_info *bdi = inode_to_bdi(inode);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2782) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2783)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2784) xas_lock_irqsave(&xas, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2785) xas_load(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2786) ret = TestSetPageWriteback(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2787) if (!ret) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2788) bool on_wblist;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2789)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2790) on_wblist = mapping_tagged(mapping,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2791) PAGECACHE_TAG_WRITEBACK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2792)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2793) xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2794) if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2795) inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2796)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2797) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2798) * We can come through here when swapping anonymous
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2799) * pages, so we don't necessarily have an inode to track
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2800) * for sync.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2801) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2802) if (mapping->host && !on_wblist)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2803) sb_mark_inode_writeback(mapping->host);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2804) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2805) if (!PageDirty(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2806) xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2807) if (!keep_write)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2808) xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2809) xas_unlock_irqrestore(&xas, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2810) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2811) ret = TestSetPageWriteback(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2812) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2813) if (!ret) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2814) inc_lruvec_page_state(page, NR_WRITEBACK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2815) inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2816) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2817) unlock_page_memcg(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2818) access_ret = arch_make_page_accessible(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2819) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2820) * If writeback has been triggered on a page that cannot be made
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2821) * accessible, it is too late to recover here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2822) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2823) VM_BUG_ON_PAGE(access_ret != 0, page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2824)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2825) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2827) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2828) EXPORT_SYMBOL(__test_set_page_writeback);
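
/*
 * Callers normally reach this through the wrappers in <linux/page-flags.h>,
 * which are roughly:
 *
 *	set_page_writeback(page)           -> __test_set_page_writeback(page, false)
 *	set_page_writeback_keepwrite(page) -> __test_set_page_writeback(page, true)
 *
 * keep_write preserves PAGECACHE_TAG_TOWRITE so that tagged writeback will
 * still revisit the page.
 */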
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2829)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2830) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2831) * Wait for a page to complete writeback
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2832) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2833) void wait_on_page_writeback(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2834) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2835) while (PageWriteback(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2836) trace_wait_on_page_writeback(page, page_mapping(page));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2837) wait_on_page_bit(page, PG_writeback);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2838) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2839) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2840) EXPORT_SYMBOL_GPL(wait_on_page_writeback);
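
/*
 * Usage sketch ("page" assumed in scope): waiting is only stable against new
 * writeback starting if the page is locked, since writeout paths must take
 * the page lock before clearing the dirty bit:
 *
 *	lock_page(page);
 *	wait_on_page_writeback(page);
 *
 * After this, no I/O is in flight on the page and none can start while the
 * lock is held.
 */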
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2841)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2842) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2843) * wait_for_stable_page() - wait for writeback to finish, if necessary.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2844) * @page: The page to wait on.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2845) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2846) * This function determines if the given page is related to a backing device
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2847) * that requires page contents to be held stable during writeback. If so, then
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2848) * it will wait for any pending writeback to complete.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2849) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2850) void wait_for_stable_page(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2851) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2852) page = thp_head(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2853) if (page->mapping->host->i_sb->s_iflags & SB_I_STABLE_WRITES)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2854) wait_on_page_writeback(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2855) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2856) EXPORT_SYMBOL_GPL(wait_for_stable_page);
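
/*
 * Typical caller sketch: a ->page_mkwrite() handler for a hypothetical
 * "myfs" holds the page lock, (re)dirties the page and waits for stability
 * before letting userspace write through the mapping:
 *
 *	static vm_fault_t myfs_page_mkwrite(struct vm_fault *vmf)
 *	{
 *		struct page *page = vmf->page;
 *
 *		lock_page(page);
 *		if (page->mapping != file_inode(vmf->vma->vm_file)->i_mapping) {
 *			unlock_page(page);
 *			return VM_FAULT_NOPAGE;
 *		}
 *		set_page_dirty(page);
 *		wait_for_stable_page(page);
 *		return VM_FAULT_LOCKED;
 *	}
 */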