// SPDX-License-Identifier: GPL-2.0
/*
 * linux/mm/vmscan.c
 *
 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
 *
 * Swap reorganised 29.12.95, Stephen Tweedie.
 * kswapd added: 7.1.96 sct
 * Removed kswapd_ctl limits, and swap out as many pages as needed
 * to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 * Multiqueue VM started 5.8.00, Rik van Riel.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/vmpressure.h>
#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>	/* for try_to_release_page(),
					buffer_heads_over_limit */
#include <linux/mm_inline.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/compaction.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
#include <linux/oom.h>
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/printk.h>
#include <linux/dax.h>
#include <linux/psi.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>

#include <linux/swapops.h>
#include <linux/balloon_compaction.h>

#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/vmscan.h>

#undef CREATE_TRACE_POINTS
#include <trace/hooks/vmscan.h>

EXPORT_TRACEPOINT_SYMBOL_GPL(mm_vmscan_direct_reclaim_begin);
EXPORT_TRACEPOINT_SYMBOL_GPL(mm_vmscan_direct_reclaim_end);

struct scan_control {
	/* How many pages shrink_list() should reclaim */
	unsigned long nr_to_reclaim;

	/*
	 * Nodemask of nodes allowed by the caller. If NULL, all nodes
	 * are scanned.
	 */
	nodemask_t *nodemask;

	/*
	 * The memory cgroup that hit its limit and as a result is the
	 * primary target of this reclaim invocation.
	 */
	struct mem_cgroup *target_mem_cgroup;

	/*
	 * Scan pressure balancing between anon and file LRUs
	 */
	unsigned long anon_cost;
	unsigned long file_cost;

	/* Can active pages be deactivated as part of reclaim? */
#define DEACTIVATE_ANON 1
#define DEACTIVATE_FILE 2
	unsigned int may_deactivate:2;
	unsigned int force_deactivate:1;
	unsigned int skipped_deactivate:1;

	/* Writepage batching in laptop mode; RECLAIM_WRITE */
	unsigned int may_writepage:1;

	/* Can mapped pages be reclaimed? */
	unsigned int may_unmap:1;

	/* Can pages be swapped as part of reclaim? */
	unsigned int may_swap:1;

	/*
	 * Cgroup memory below memory.low is protected as long as we
	 * don't threaten to OOM. If any cgroup is reclaimed at
	 * reduced force or passed over entirely due to its memory.low
	 * setting (memcg_low_skipped), and nothing is reclaimed as a
	 * result, then go back for one more cycle that reclaims the protected
	 * memory (memcg_low_reclaim) to avert OOM.
	 */
	unsigned int memcg_low_reclaim:1;
	unsigned int memcg_low_skipped:1;

	unsigned int hibernation_mode:1;

	/* One of the zones is ready for compaction */
	unsigned int compaction_ready:1;

	/* There is easily reclaimable cold cache in the current node */
	unsigned int cache_trim_mode:1;

	/* The file pages on the current node are dangerously low */
	unsigned int file_is_tiny:1;

	/* Allocation order */
	s8 order;

	/* Scan (total_size >> priority) pages at once */
	s8 priority;

	/* The highest zone to isolate pages for reclaim from */
	s8 reclaim_idx;

	/* This context's GFP mask */
	gfp_t gfp_mask;

	/* Incremented by the number of inactive pages that were scanned */
	unsigned long nr_scanned;

	/* Number of pages freed so far during a call to shrink_zones() */
	unsigned long nr_reclaimed;

	struct {
		unsigned int dirty;
		unsigned int unqueued_dirty;
		unsigned int congested;
		unsigned int writeback;
		unsigned int immediate;
		unsigned int file_taken;
		unsigned int taken;
	} nr;

	/* for recording how much slab has been reclaimed so far */
	struct reclaim_state reclaim_state;
};

#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetchw(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

/*
 * From 0 .. 200. Higher means more swappy.
 */
int vm_swappiness = 60;
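
/*
 * Note (illustrative): vm_swappiness is tunable at runtime via
 * /proc/sys/vm/swappiness, e.g. "sysctl vm.swappiness=100"; values
 * above 100 increasingly favour reclaiming anonymous memory over
 * file cache.
 */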

#define DEF_KSWAPD_THREADS_PER_NODE 1
static int kswapd_threads = DEF_KSWAPD_THREADS_PER_NODE;
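
/*
 * kswapd_threads is set from the boot command line, e.g.
 * "kswapd_per_node=4" (an illustrative value); kswapd_per_node_setup()
 * below ignores values outside 1..MAX_KSWAPD_THREADS and keeps the
 * default of one thread per node.
 */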
static int __init kswapd_per_node_setup(char *str)
{
	int tmp;

	if (kstrtoint(str, 0, &tmp) < 0)
		return 0;

	if (tmp > MAX_KSWAPD_THREADS || tmp <= 0)
		return 0;

	kswapd_threads = tmp;
	return 1;
}
__setup("kswapd_per_node=", kswapd_per_node_setup);

static void set_task_reclaim_state(struct task_struct *task,
				   struct reclaim_state *rs)
{
	/* Check for an overwrite */
	WARN_ON_ONCE(rs && task->reclaim_state);

	/* Check for the nulling of an already-nulled member */
	WARN_ON_ONCE(!rs && !task->reclaim_state);

	task->reclaim_state = rs;
}

static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);

#ifdef CONFIG_MEMCG
/*
 * We allow subsystems to populate their shrinker-related
 * LRU lists before register_shrinker_prepared() is called
 * for the shrinker, since we don't want to impose
 * restrictions on their internal registration order.
 * In this case shrink_slab_memcg() may find the corresponding
 * bit set in the shrinker map.
 *
 * This value is used by the function to detect registering
 * shrinkers and to skip do_shrink_slab() calls for them.
 */
#define SHRINKER_REGISTERING ((struct shrinker *)~0UL)

static DEFINE_IDR(shrinker_idr);
static int shrinker_nr_max;

static int prealloc_memcg_shrinker(struct shrinker *shrinker)
{
	int id, ret = -ENOMEM;

	down_write(&shrinker_rwsem);
	/*
	 * This allocation may recurse into reclaim and call shrinkers,
	 * so shrink_slab() must use down_read_trylock() on shrinker_rwsem.
	 */
	id = idr_alloc(&shrinker_idr, SHRINKER_REGISTERING, 0, 0, GFP_KERNEL);
	if (id < 0)
		goto unlock;

	if (id >= shrinker_nr_max) {
		if (memcg_expand_shrinker_maps(id)) {
			idr_remove(&shrinker_idr, id);
			goto unlock;
		}

		shrinker_nr_max = id + 1;
	}
	shrinker->id = id;
	ret = 0;
unlock:
	up_write(&shrinker_rwsem);
	return ret;
}

static void unregister_memcg_shrinker(struct shrinker *shrinker)
{
	int id = shrinker->id;

	BUG_ON(id < 0);

	down_write(&shrinker_rwsem);
	idr_remove(&shrinker_idr, id);
	up_write(&shrinker_rwsem);
}

static bool cgroup_reclaim(struct scan_control *sc)
{
	return sc->target_mem_cgroup;
}

/**
 * writeback_throttling_sane - is the usual dirty throttling mechanism available?
 * @sc: scan_control in question
 *
 * The normal page dirty throttling mechanism in balance_dirty_pages() is
 * completely broken with the legacy memcg and direct stalling in
 * shrink_page_list() is used for throttling instead, which lacks all the
 * niceties such as fairness, adaptive pausing, bandwidth proportional
 * allocation and configurability.
 *
 * This function tests whether the vmscan currently in progress can assume
 * that the normal dirty throttling mechanism is operational.
 */
static bool writeback_throttling_sane(struct scan_control *sc)
{
	if (!cgroup_reclaim(sc))
		return true;
#ifdef CONFIG_CGROUP_WRITEBACK
	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return true;
#endif
	return false;
}
#else
static int prealloc_memcg_shrinker(struct shrinker *shrinker)
{
	return 0;
}

static void unregister_memcg_shrinker(struct shrinker *shrinker)
{
}

static bool cgroup_reclaim(struct scan_control *sc)
{
	return false;
}

static bool writeback_throttling_sane(struct scan_control *sc)
{
	return true;
}
#endif

/*
 * This misses isolated pages which are not accounted for to save counters.
 * As the data only determines if reclaim or compaction continues, it is
 * not expected that isolated pages will be a dominating factor.
 */
unsigned long zone_reclaimable_pages(struct zone *zone)
{
	unsigned long nr;

	nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
		zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
	if (get_nr_swap_pages() > 0)
		nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
			zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);

	return nr;
}

/**
 * lruvec_lru_size - Returns the number of pages on the given LRU list.
 * @lruvec: lru vector
 * @lru: lru to use
 * @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list)
 */
unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
{
	unsigned long size = 0;
	int zid;

	for (zid = 0; zid <= zone_idx && zid < MAX_NR_ZONES; zid++) {
		struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];

		if (!managed_zone(zone))
			continue;

		if (!mem_cgroup_disabled())
			size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
		else
			size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru);
	}
	return size;
}

/*
 * Add a shrinker callback to be called from the vm.
 */
int prealloc_shrinker(struct shrinker *shrinker)
{
	unsigned int size = sizeof(*shrinker->nr_deferred);

	if (shrinker->flags & SHRINKER_NUMA_AWARE)
		size *= nr_node_ids;

	shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
	if (!shrinker->nr_deferred)
		return -ENOMEM;

	if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
		if (prealloc_memcg_shrinker(shrinker))
			goto free_deferred;
	}

	return 0;

free_deferred:
	kfree(shrinker->nr_deferred);
	shrinker->nr_deferred = NULL;
	return -ENOMEM;
}

void free_prealloced_shrinker(struct shrinker *shrinker)
{
	if (!shrinker->nr_deferred)
		return;

	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
		unregister_memcg_shrinker(shrinker);

	kfree(shrinker->nr_deferred);
	shrinker->nr_deferred = NULL;
}

void register_shrinker_prepared(struct shrinker *shrinker)
{
	down_write(&shrinker_rwsem);
	list_add_tail(&shrinker->list, &shrinker_list);
#ifdef CONFIG_MEMCG
	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
		idr_replace(&shrinker_idr, shrinker, shrinker->id);
#endif
	up_write(&shrinker_rwsem);
}

int register_shrinker(struct shrinker *shrinker)
{
	int err = prealloc_shrinker(shrinker);

	if (err)
		return err;
	register_shrinker_prepared(shrinker);
	return 0;
}
EXPORT_SYMBOL(register_shrinker);
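
/*
 * Illustrative usage only (the foo_* names are made up, not part of this
 * file): a subsystem typically embeds a struct shrinker describing its
 * cache and registers it once the cache exists,
 *
 *	static struct shrinker foo_shrinker = {
 *		.count_objects	= foo_count_objects,
 *		.scan_objects	= foo_scan_objects,
 *		.seeks		= DEFAULT_SEEKS,
 *	};
 *
 *	err = register_shrinker(&foo_shrinker);
 *
 * and calls unregister_shrinker(&foo_shrinker) before the cache goes away.
 */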

/*
 * Remove one
 */
void unregister_shrinker(struct shrinker *shrinker)
{
	if (!shrinker->nr_deferred)
		return;
	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
		unregister_memcg_shrinker(shrinker);
	down_write(&shrinker_rwsem);
	list_del(&shrinker->list);
	up_write(&shrinker_rwsem);
	kfree(shrinker->nr_deferred);
	shrinker->nr_deferred = NULL;
}
EXPORT_SYMBOL(unregister_shrinker);

#define SHRINK_BATCH 128

static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
				    struct shrinker *shrinker, int priority)
{
	unsigned long freed = 0;
	unsigned long long delta;
	long total_scan;
	long freeable;
	long nr;
	long new_nr;
	int nid = shrinkctl->nid;
	long batch_size = shrinker->batch ? shrinker->batch
					  : SHRINK_BATCH;
	long scanned = 0, next_deferred;

	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
		nid = 0;

	freeable = shrinker->count_objects(shrinker, shrinkctl);
	if (freeable == 0 || freeable == SHRINK_EMPTY)
		return freeable;

	/*
	 * copy the current shrinker scan count into a local variable
	 * and zero it so that other concurrent shrinker invocations
	 * don't also do this scanning work.
	 */
	nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);

	total_scan = nr;
	if (shrinker->seeks) {
		delta = freeable >> priority;
		delta *= 4;
		do_div(delta, shrinker->seeks);
	} else {
		/*
		 * These objects don't require any IO to create. Trim
		 * them aggressively under memory pressure to keep
		 * them from causing refetches in the IO caches.
		 */
		delta = freeable / 2;
	}
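
	/*
	 * Worked example (illustrative numbers, assuming the usual mainline
	 * defaults DEF_PRIORITY == 12 and DEFAULT_SEEKS == 2): for
	 * freeable == 4096 the delta is (4096 >> 12) * 4 / 2 == 2 objects,
	 * and it doubles each time the reclaim priority drops by one.
	 */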

	total_scan += delta;
	if (total_scan < 0) {
		pr_err("shrink_slab: %pS negative objects to delete nr=%ld\n",
		       shrinker->scan_objects, total_scan);
		total_scan = freeable;
		next_deferred = nr;
	} else
		next_deferred = total_scan;

	/*
	 * We need to avoid excessive windup on filesystem shrinkers
	 * due to large numbers of GFP_NOFS allocations causing the
	 * shrinkers to return -1 all the time. This results in a large
	 * nr being built up so when a shrink that can do some work
	 * comes along it empties the entire cache due to nr >>>
	 * freeable. This is bad for sustaining a working set in
	 * memory.
	 *
	 * Hence only allow the shrinker to scan the entire cache when
	 * a large delta change is calculated directly.
	 */
	if (delta < freeable / 4)
		total_scan = min(total_scan, freeable / 2);

	/*
	 * Avoid risking looping forever due to too large nr value:
	 * never try to free more than twice the estimated number of
	 * freeable entries.
	 */
	if (total_scan > freeable * 2)
		total_scan = freeable * 2;

	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
				   freeable, delta, total_scan, priority);

	/*
	 * Normally, we should not scan less than batch_size objects in one
	 * pass to avoid too frequent shrinker calls, but if the slab has less
	 * than batch_size objects in total and we are really tight on memory,
	 * we will try to reclaim all available objects, otherwise we can end
	 * up failing allocations although there are plenty of reclaimable
	 * objects spread over several slabs with usage less than the
	 * batch_size.
	 *
	 * We detect the "tight on memory" situations by looking at the total
	 * number of objects we want to scan (total_scan). If it is greater
	 * than the total number of objects on slab (freeable), we must be
	 * scanning at high prio and therefore should try to reclaim as much as
	 * possible.
	 */
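	/*
	 * Example: with the default batch_size of 128, freeable == 32 and
	 * total_scan == 64, the second condition below still lets the loop
	 * run once with nr_to_scan == 64.
	 */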
	while (total_scan >= batch_size ||
	       total_scan >= freeable) {
		unsigned long ret;
		unsigned long nr_to_scan = min(batch_size, total_scan);

		shrinkctl->nr_to_scan = nr_to_scan;
		shrinkctl->nr_scanned = nr_to_scan;
		ret = shrinker->scan_objects(shrinker, shrinkctl);
		if (ret == SHRINK_STOP)
			break;
		freed += ret;

		count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
		total_scan -= shrinkctl->nr_scanned;
		scanned += shrinkctl->nr_scanned;

		cond_resched();
	}

	if (next_deferred >= scanned)
		next_deferred -= scanned;
	else
		next_deferred = 0;
	/*
	 * move the unused scan count back into the shrinker in a
	 * manner that handles concurrent updates. If we exhausted the
	 * scan, there is no need to do an update.
	 */
	if (next_deferred > 0)
		new_nr = atomic_long_add_return(next_deferred,
						&shrinker->nr_deferred[nid]);
	else
		new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);

	trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan);
	return freed;
}

#ifdef CONFIG_MEMCG
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
				       struct mem_cgroup *memcg, int priority)
{
	struct memcg_shrinker_map *map;
	unsigned long ret, freed = 0;
	int i;

	if (!mem_cgroup_online(memcg))
		return 0;

	if (!down_read_trylock(&shrinker_rwsem))
		return 0;

	map = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_map,
					true);
	if (unlikely(!map))
		goto unlock;

	for_each_set_bit(i, map->map, shrinker_nr_max) {
		struct shrink_control sc = {
			.gfp_mask = gfp_mask,
			.nid = nid,
			.memcg = memcg,
		};
		struct shrinker *shrinker;

		shrinker = idr_find(&shrinker_idr, i);
		if (unlikely(!shrinker || shrinker == SHRINKER_REGISTERING)) {
			if (!shrinker)
				clear_bit(i, map->map);
			continue;
		}

		/* Call non-slab shrinkers even though kmem is disabled */
		if (!memcg_kmem_enabled() &&
		    !(shrinker->flags & SHRINKER_NONSLAB))
			continue;

		ret = do_shrink_slab(&sc, shrinker, priority);
		if (ret == SHRINK_EMPTY) {
			clear_bit(i, map->map);
			/*
			 * After the shrinker reported that it had no objects to
			 * free, but before we cleared the corresponding bit in
			 * the memcg shrinker map, a new object might have been
			 * added. To make sure we have the bit set in this
			 * case, we invoke the shrinker one more time and reset
			 * the bit if it reports that it is not empty anymore.
			 * The memory barrier here pairs with the barrier in
			 * memcg_set_shrinker_bit():
			 *
			 * list_lru_add()     shrink_slab_memcg()
			 *   list_add_tail()    clear_bit()
			 *   <MB>               <MB>
			 *   set_bit()          do_shrink_slab()
			 */
			smp_mb__after_atomic();
			ret = do_shrink_slab(&sc, shrinker, priority);
			if (ret == SHRINK_EMPTY)
				ret = 0;
			else
				memcg_set_shrinker_bit(memcg, nid, i);
		}
		freed += ret;

		if (rwsem_is_contended(&shrinker_rwsem)) {
			freed = freed ? : 1;
			break;
		}
	}
unlock:
	up_read(&shrinker_rwsem);
	return freed;
}
#else /* CONFIG_MEMCG */
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
				       struct mem_cgroup *memcg, int priority)
{
	return 0;
}
#endif /* CONFIG_MEMCG */

/**
 * shrink_slab - shrink slab caches
 * @gfp_mask: allocation context
 * @nid: node whose slab caches to target
 * @memcg: memory cgroup whose slab caches to target
 * @priority: the reclaim priority
 *
 * Call the shrink functions to age shrinkable caches.
 *
 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set;
 * unaware shrinkers will receive a node id of 0 instead.
 *
 * @memcg specifies the memory cgroup to target. Unaware shrinkers
 * are called only if it is the root cgroup.
 *
 * @priority is sc->priority: the number of freeable objects is shifted
 * right by it to obtain the scan target.
 *
 * Returns the number of reclaimed slab objects.
 */
unsigned long shrink_slab(gfp_t gfp_mask, int nid,
			  struct mem_cgroup *memcg,
			  int priority)
{
	unsigned long ret, freed = 0;
	struct shrinker *shrinker;
	bool bypass = false;

	trace_android_vh_shrink_slab_bypass(gfp_mask, nid, memcg, priority, &bypass);
	if (bypass)
		return 0;

	/*
	 * The root memcg might be allocated even though memcg is disabled
	 * via the "cgroup_disable=memory" boot parameter. This could make
	 * mem_cgroup_is_root() return false; we would then run only the
	 * memcg slab shrink and skip the global shrink, which may result
	 * in premature OOM.
	 */
	if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
		return shrink_slab_memcg(gfp_mask, nid, memcg, priority);

	if (!down_read_trylock(&shrinker_rwsem))
		goto out;

	list_for_each_entry(shrinker, &shrinker_list, list) {
		struct shrink_control sc = {
			.gfp_mask = gfp_mask,
			.nid = nid,
			.memcg = memcg,
		};

		ret = do_shrink_slab(&sc, shrinker, priority);
		if (ret == SHRINK_EMPTY)
			ret = 0;
		freed += ret;
		/*
		 * Bail out if someone wants to register a new shrinker to
		 * prevent the registration from being stalled for long periods
		 * by parallel ongoing shrinking.
		 */
		if (rwsem_is_contended(&shrinker_rwsem)) {
			freed = freed ? : 1;
			break;
		}
	}

	up_read(&shrinker_rwsem);
out:
	cond_resched();
	return freed;
}
EXPORT_SYMBOL_GPL(shrink_slab);

void drop_slab_node(int nid)
{
	unsigned long freed;

	do {
		struct mem_cgroup *memcg = NULL;

		if (fatal_signal_pending(current))
			return;

		freed = 0;
		memcg = mem_cgroup_iter(NULL, NULL, NULL);
		do {
			freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
		} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
	} while (freed > 10);
}

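/*
 * Note: drop_slab() below is what backs "echo 2 > /proc/sys/vm/drop_caches"
 * (and the slab half of "echo 3"); see the drop_caches sysctl handler.
 */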
void drop_slab(void)
{
	int nid;

	for_each_online_node(nid)
		drop_slab_node(nid);
}

static inline int is_page_cache_freeable(struct page *page)
{
	/*
	 * A freeable page cache page is referenced only by the caller
	 * that isolated the page, the page cache and optional buffer
	 * heads at page->private.
	 */
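	/*
	 * For example, an order-0 page with no private data is freeable
	 * when page_count() == 2: one reference held by the isolating
	 * caller and one held by the page cache itself.
	 */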
	int page_cache_pins = thp_nr_pages(page);
	return page_count(page) - page_has_private(page) == 1 + page_cache_pins;
}

static int may_write_to_inode(struct inode *inode)
{
	if (current->flags & PF_SWAPWRITE)
		return 1;
	if (!inode_write_congested(inode))
		return 1;
	if (inode_to_bdi(inode) == current->backing_dev_info)
		return 1;
	return 0;
}

/*
 * We detected a synchronous write error writing a page out. Probably
 * -ENOSPC. We need to propagate that into the address_space for a subsequent
 * fsync(), msync() or close().
 *
 * The tricky part is that after writepage we cannot touch the mapping: nothing
 * prevents it from being freed up. But we have a ref on the page and once
 * that page is locked, the mapping is pinned.
 *
 * We're allowed to run sleeping lock_page() here because we know the caller has
 * __GFP_FS.
 */
static void handle_write_error(struct address_space *mapping,
			       struct page *page, int error)
{
	lock_page(page);
	if (page_mapping(page) == mapping)
		mapping_set_error(mapping, error);
	unlock_page(page);
}

^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 795)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 796) /* possible outcome of pageout() */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 797) typedef enum {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 798) /* failed to write page out, page is locked */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 799) PAGE_KEEP,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 800) /* move page to the active list, page is locked */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 801) PAGE_ACTIVATE,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 802) /* page has been sent to the disk successfully, page is unlocked */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 803) PAGE_SUCCESS,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 804) /* page is clean and locked */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 805) PAGE_CLEAN,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 806) } pageout_t;
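/*
 * shrink_page_list() below maps these outcomes onto its labels:
 * PAGE_KEEP -> keep_locked, PAGE_ACTIVATE -> activate_locked,
 * PAGE_SUCCESS -> re-check the writeback/dirty state and account
 * stat->nr_pageout, PAGE_CLEAN -> fall through and try to free the page.
 */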
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 807)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 808) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 809) * pageout is called by shrink_page_list() for each dirty page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 810) * Calls ->writepage().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 811) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 812) static pageout_t pageout(struct page *page, struct address_space *mapping)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 813) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 814) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 815) * If the page is dirty, only perform writeback if that write
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 816) * will be non-blocking, to prevent this allocation from being
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 817) * stalled by pagecache activity. But note that there may be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 818) * stalls if we need to run get_block(). We could test
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 819) * PagePrivate for that.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 820) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 821) * If this process is currently in __generic_file_write_iter() against
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 822) * this page's queue, we can perform writeback even if that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 823) * will block.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 824) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 825) * If the page is swapcache, write it back even if that would
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 826) * block, for some throttling. This happens by accident, because
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 827) * swap_backing_dev_info is bust: it doesn't reflect the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 828) * congestion state of the swapdevs. Easy to fix, if needed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 829) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 830) if (!is_page_cache_freeable(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 831) return PAGE_KEEP;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 832) if (!mapping) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 833) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 834) * Some data journaling orphaned pages can have
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 835) * page->mapping == NULL while being dirty with clean buffers.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 836) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 837) if (page_has_private(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 838) if (try_to_free_buffers(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 839) ClearPageDirty(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 840) pr_info("%s: orphaned page\n", __func__);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 841) return PAGE_CLEAN;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 842) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 843) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 844) return PAGE_KEEP;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 845) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 846) if (mapping->a_ops->writepage == NULL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 847) return PAGE_ACTIVATE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 848) if (!may_write_to_inode(mapping->host))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 849) return PAGE_KEEP;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 850)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 851) if (clear_page_dirty_for_io(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 852) int res;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 853) struct writeback_control wbc = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 854) .sync_mode = WB_SYNC_NONE,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 855) .nr_to_write = SWAP_CLUSTER_MAX,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 856) .range_start = 0,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 857) .range_end = LLONG_MAX,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 858) .for_reclaim = 1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 859) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 860)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 861) SetPageReclaim(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 862) res = mapping->a_ops->writepage(page, &wbc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 863) if (res < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 864) handle_write_error(mapping, page, res);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 865) if (res == AOP_WRITEPAGE_ACTIVATE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 866) ClearPageReclaim(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 867) return PAGE_ACTIVATE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 868) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 869)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 870) if (!PageWriteback(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 871) /* synchronous write or broken a_ops? */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 872) ClearPageReclaim(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 873) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 874) trace_mm_vmscan_writepage(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 875) inc_node_page_state(page, NR_VMSCAN_WRITE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 876) return PAGE_SUCCESS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 877) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 878)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 879) return PAGE_CLEAN;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 880) }
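/*
 * Note on AOP_WRITEPAGE_ACTIVATE above: a filesystem may decline to
 * write a page from reclaim context (shmem does this in some cases) and
 * return the page still locked, asking for it to be re-activated rather
 * than written, which is why it is reported as PAGE_ACTIVATE.
 */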
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 881)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 882) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 883) * Same as remove_mapping, but if the page is removed from the mapping, it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 884) * gets returned with a refcount of 0.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 885) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 886) static int __remove_mapping(struct address_space *mapping, struct page *page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 887) bool reclaimed, struct mem_cgroup *target_memcg)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 888) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 889) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 890) int refcount;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 891) void *shadow = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 892)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 893) BUG_ON(!PageLocked(page));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 894) BUG_ON(mapping != page_mapping(page));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 895)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 896) xa_lock_irqsave(&mapping->i_pages, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 897) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 898) * The non-racy check for a busy page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 899) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 900) * Must be careful with the order of the tests. When someone has
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 901) * a ref to the page, it may be possible that they dirty it then
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 902) * drop the reference. So if PageDirty is tested before page_count
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 903) * here, then the following race may occur:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 904) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 905) * get_user_pages(&page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 906) * [user mapping goes away]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 907) * write_to(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 908) * !PageDirty(page) [good]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 909) * SetPageDirty(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 910) * put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 911) * !page_count(page) [good, discard it]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 912) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 913) * [oops, our write_to data is lost]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 914) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 915) * Reversing the order of the tests ensures such a situation cannot
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 916) * escape unnoticed. The smp_rmb is needed to ensure the page->flags
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 917) * load is not satisfied before that of page->_refcount.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 918) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 919) * Note that if SetPageDirty is always performed via set_page_dirty,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 920) * and thus under the i_pages lock, then this ordering is not required.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 921) */
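/*
 * The expected count is one reference per base page held by the
 * page/swap cache plus the caller's isolation reference: 2 for an
 * order-0 page, 513 for a 512-page THP. Any additional reference means
 * someone else is still using the page and it cannot be freed here.
 */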
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 922) refcount = 1 + compound_nr(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 923) if (!page_ref_freeze(page, refcount))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 924) goto cannot_free;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 925) /* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 926) if (unlikely(PageDirty(page))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 927) page_ref_unfreeze(page, refcount);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 928) goto cannot_free;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 929) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 930)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 931) if (PageSwapCache(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 932) swp_entry_t swap = { .val = page_private(page) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 933) mem_cgroup_swapout(page, swap);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 934) if (reclaimed && !mapping_exiting(mapping))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 935) shadow = workingset_eviction(page, target_memcg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 936) __delete_from_swap_cache(page, swap, shadow);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 937) xa_unlock_irqrestore(&mapping->i_pages, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 938) put_swap_page(page, swap);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 939) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 940) void (*freepage)(struct page *);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 941)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 942) freepage = mapping->a_ops->freepage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 943) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 944) * Remember a shadow entry for reclaimed file cache in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 945) * order to detect refaults, thus thrashing, later on.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 946) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 947) * But don't store shadows in an address space that is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 948) * already exiting. This is not just an optimization,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 949) * inode reclaim needs to empty out the radix tree or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 950) * the nodes are lost. Don't plant shadows behind its
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 951) * back.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 952) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 953) * We also don't store shadows for DAX mappings because the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 954) * only page cache pages found in these are zero pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 955) * covering holes, and because we don't want to mix DAX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 956) * exceptional entries and shadow exceptional entries in the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 957) * same address_space.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 958) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 959) if (reclaimed && page_is_file_lru(page) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 960) !mapping_exiting(mapping) && !dax_mapping(mapping))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 961) shadow = workingset_eviction(page, target_memcg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 962) __delete_from_page_cache(page, shadow);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 963) xa_unlock_irqrestore(&mapping->i_pages, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 964)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 965) if (freepage != NULL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 966) freepage(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 967) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 968)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 969) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 970)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 971) cannot_free:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 972) xa_unlock_irqrestore(&mapping->i_pages, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 973) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 974) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 975)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 976) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 977) * Attempt to detach a locked page from its ->mapping. If it is dirty or if
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 978) * someone else has a ref on the page, abort and return 0. If it was
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 979) * successfully detached, return 1. Assumes the caller has a single ref on
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 980) * this page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 981) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 982) int remove_mapping(struct address_space *mapping, struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 983) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 984) if (__remove_mapping(mapping, page, false, NULL)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 985) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 986) * Unfreezing the refcount with 1 rather than 2 effectively
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 987) * drops the pagecache ref for us without requiring another
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 988) * atomic operation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 989) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 990) page_ref_unfreeze(page, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 991) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 992) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 993) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 994) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 995)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 996) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 997) * putback_lru_page - put previously isolated page onto appropriate LRU list
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 998) * @page: page to be put back to appropriate lru list
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 999) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1000) * Add previously isolated @page to appropriate LRU list.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1001) * Page may still be unevictable for other reasons.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1002) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1003) * lru_lock must not be held, interrupts must be enabled.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1004) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1005) void putback_lru_page(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1006) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1007) lru_cache_add(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1008) put_page(page); /* drop ref from isolate */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1009) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1010)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1011) enum page_references {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1012) PAGEREF_RECLAIM,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1013) PAGEREF_RECLAIM_CLEAN,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1014) PAGEREF_KEEP,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1015) PAGEREF_ACTIVATE,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1016) };
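/*
 * shrink_page_list() acts on these as follows: PAGEREF_ACTIVATE moves
 * the page to activate_locked, PAGEREF_KEEP to keep_locked (counted in
 * stat->nr_ref_keep), and both PAGEREF_RECLAIM and PAGEREF_RECLAIM_CLEAN
 * proceed with reclaim, except that PAGEREF_RECLAIM_CLEAN refuses to
 * start pageout() on a page that is still dirty.
 */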
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1017)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1018) static enum page_references page_check_references(struct page *page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1019) struct scan_control *sc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1020) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1021) int referenced_ptes, referenced_page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1022) unsigned long vm_flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1023)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1024) referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1025) &vm_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1026) referenced_page = TestClearPageReferenced(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1027)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1028) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1029) * Mlock lost the isolation race with us. Let try_to_unmap()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1030) * move the page to the unevictable list.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1031) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1032) if (vm_flags & VM_LOCKED)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1033) return PAGEREF_RECLAIM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1034)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1035) if (referenced_ptes) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1036) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1037) * All mapped pages start out with page table
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1038) * references from the instantiating fault, so we need
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1039) * to look twice if a mapped file page is used more
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1040) * than once.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1041) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1042) * Mark it and spare it for another trip around the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1043) * inactive list. Another page table reference will
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1044) * lead to its activation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1045) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1046) * Note: the mark is set for activated pages as well
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1047) * so that recently deactivated but used pages are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1048) * quickly recovered.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1049) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1050) SetPageReferenced(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1051)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1052) if (referenced_page || referenced_ptes > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1053) return PAGEREF_ACTIVATE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1054)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1055) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1056) * Activate file-backed executable pages after first usage.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1057) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1058) if ((vm_flags & VM_EXEC) && !PageSwapBacked(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1059) return PAGEREF_ACTIVATE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1060)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1061) return PAGEREF_KEEP;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1062) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1063)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1064) /* Reclaim if clean, defer dirty pages to writeback */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1065) if (referenced_page && !PageSwapBacked(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1066) return PAGEREF_RECLAIM_CLEAN;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1067)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1068) return PAGEREF_RECLAIM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1069) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1070)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1071) /* Check if a page is dirty or under writeback */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1072) static void page_check_dirty_writeback(struct page *page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1073) bool *dirty, bool *writeback)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1074) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1075) struct address_space *mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1076)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1077) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1078) * Anonymous pages are not handled by flushers and must be written
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1079) * from reclaim context. Do not stall reclaim based on them.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1080) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1081) if (!page_is_file_lru(page) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1082) (PageAnon(page) && !PageSwapBacked(page))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1083) *dirty = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1084) *writeback = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1085) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1086) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1087)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1088) /* By default assume that the page flags are accurate */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1089) *dirty = PageDirty(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1090) *writeback = PageWriteback(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1091)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1092) /* Verify dirty/writeback state if the filesystem supports it */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1093) if (!page_has_private(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1094) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1095)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1096) mapping = page_mapping(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1097) if (mapping && mapping->a_ops->is_dirty_writeback)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1098) mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1099) }
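/*
 * For pages with private data (e.g. buffer heads), the page flags alone
 * may not reflect the state the filesystem actually cares about, so a
 * filesystem that provides ->is_dirty_writeback() is given the chance
 * to report the dirty/writeback state itself.
 */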
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1100)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1101) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1102) * shrink_page_list() returns the number of reclaimed pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1103) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1104) static unsigned int shrink_page_list(struct list_head *page_list,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1105) struct pglist_data *pgdat,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1106) struct scan_control *sc,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1107) struct reclaim_stat *stat,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1108) bool ignore_references)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1109) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1110) LIST_HEAD(ret_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1111) LIST_HEAD(free_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1112) unsigned int nr_reclaimed = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1113) unsigned int pgactivate = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1114)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1115) memset(stat, 0, sizeof(*stat));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1116) cond_resched();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1117)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1118) while (!list_empty(page_list)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1119) struct address_space *mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1120) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1121) enum page_references references = PAGEREF_RECLAIM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1122) bool dirty, writeback, may_enter_fs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1123) unsigned int nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1124)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1125) cond_resched();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1126)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1127) page = lru_to_page(page_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1128) list_del(&page->lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1129)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1130) if (!trylock_page(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1131) goto keep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1132)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1133) VM_BUG_ON_PAGE(PageActive(page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1134)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1135) nr_pages = compound_nr(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1136)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1137) /* Account the number of base pages, even for a THP */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1138) sc->nr_scanned += nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1139)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1140) if (unlikely(!page_evictable(page)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1141) goto activate_locked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1142)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1143) if (!sc->may_unmap && page_mapped(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1144) goto keep_locked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1145)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1146) may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1147) (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1148)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1149) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1150) * The number of dirty pages determines if a node is marked
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1151) * reclaim_congested which affects wait_iff_congested. kswapd
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1152) * will stall and start writing pages if the tail of the LRU
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1153) * is all dirty unqueued pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1154) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1155) page_check_dirty_writeback(page, &dirty, &writeback);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1156) if (dirty || writeback)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1157) stat->nr_dirty++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1158)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1159) if (dirty && !writeback)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1160) stat->nr_unqueued_dirty++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1161)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1162) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1163) * Treat this page as congested if the underlying BDI is congested or if
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1164) * pages are cycling through the LRU so quickly that the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1165) * pages marked for immediate reclaim are making it to the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1166) * end of the LRU a second time.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1167) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1168) mapping = page_mapping(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1169) if (((dirty || writeback) && mapping &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1170) inode_write_congested(mapping->host)) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1171) (writeback && PageReclaim(page)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1172) stat->nr_congested++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1173)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1174) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1175) * If a page at the tail of the LRU is under writeback, there
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1176) * are three cases to consider.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1177) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1178) * 1) If reclaim is encountering an excessive number of pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1179) * under writeback and this page is both under writeback and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1180) * PageReclaim then it indicates that pages are being queued
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1181) * for IO but are being recycled through the LRU before the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1182) * IO can complete. Waiting on the page itself risks an
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1183) * indefinite stall if it is impossible to writeback the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1184) * page due to IO error or disconnected storage so instead
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1185) * note that the LRU is being scanned too quickly and the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1186) * caller can stall after page list has been processed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1187) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1188) * 2) Global or new memcg reclaim encounters a page that is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1189) * not marked for immediate reclaim, or the caller does not
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1190) * have __GFP_FS (or __GFP_IO if it's simply going to swap,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1191) * not to fs). In this case mark the page for immediate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1192) * reclaim and continue scanning.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1193) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1194) * Require may_enter_fs because we would wait on fs, which
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1195) * may not have submitted IO yet. And the loop driver might
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1196) * enter reclaim, and deadlock if it waits on a page for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1197) * which it is needed to do the write (loop masks off
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1198) * __GFP_IO|__GFP_FS for this reason); but more thought
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1199) * would probably show more reasons.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1200) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1201) * 3) Legacy memcg encounters a page that is already marked
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1202) * PageReclaim. memcg does not have any dirty pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1203) * throttling so we could easily OOM just because too many
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1204) * pages are in writeback and there is nothing else to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1205) * reclaim. Wait for the writeback to complete.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1206) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1207) * In cases 1) and 2) we activate the pages to get them out of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1208) * the way while we continue scanning for clean pages on the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1209) * inactive list and refilling from the active list. The
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1210) * observation here is that waiting for disk writes is more
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1211) * expensive than potentially causing reloads down the line.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1212) * Since they're marked for immediate reclaim, they won't put
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1213) * memory pressure on the cache working set any longer than it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1214) * takes to write them to disk.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1215) */
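/*
 * Concretely, in the branches below: case 1 is counted in
 * stat->nr_immediate and the page is activated; case 2 sets
 * PageReclaim, is counted in stat->nr_writeback and the page is
 * activated; case 3 waits for the writeback to finish and puts the
 * page back on page_list to be retried.
 */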
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1216) if (PageWriteback(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1217) /* Case 1 above */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1218) if (current_is_kswapd() &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1219) PageReclaim(page) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1220) test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1221) stat->nr_immediate++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1222) goto activate_locked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1223)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1224) /* Case 2 above */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1225) } else if (writeback_throttling_sane(sc) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1226) !PageReclaim(page) || !may_enter_fs) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1227) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1228) * This is slightly racy - end_page_writeback()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1229) * might have just cleared PageReclaim, then
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1230) * setting PageReclaim here may end up being interpreted
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1231) * as PageReadahead - but that does not matter
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1232) * enough to care. What we do want is for this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1233) * page to have PageReclaim set next time memcg
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1234) * reclaim reaches the tests above, so it will
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1235) * then wait_on_page_writeback() to avoid OOM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1236) * and it's also appropriate in global reclaim.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1237) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1238) SetPageReclaim(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1239) stat->nr_writeback++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1240) goto activate_locked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1241)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1242) /* Case 3 above */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1243) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1244) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1245) wait_on_page_writeback(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1246) /* then go back and try same page again */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1247) list_add_tail(&page->lru, page_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1248) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1249) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1250) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1251)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1252) if (!ignore_references)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1253) references = page_check_references(page, sc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1254)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1255) switch (references) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1256) case PAGEREF_ACTIVATE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1257) goto activate_locked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1258) case PAGEREF_KEEP:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1259) stat->nr_ref_keep += nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1260) goto keep_locked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1261) case PAGEREF_RECLAIM:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1262) case PAGEREF_RECLAIM_CLEAN:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1263) ; /* try to reclaim the page below */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1264) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1265)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1266) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1267) * Anonymous process memory has backing store?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1268) * Try to allocate it some swap space here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1269) * Lazyfree pages can be freed directly.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1270) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1271) if (PageAnon(page) && PageSwapBacked(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1272) if (!PageSwapCache(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1273) if (!(sc->gfp_mask & __GFP_IO))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1274) goto keep_locked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1275) if (page_maybe_dma_pinned(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1276) goto keep_locked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1277) if (PageTransHuge(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1278) /* cannot split THP, skip it */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1279) if (!can_split_huge_page(page, NULL))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1280) goto activate_locked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1281) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1282) * Split pages without a PMD map right
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1283) * away. Chances are some or all of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1284) * tail pages can be freed without IO.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1285) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1286) if (!compound_mapcount(page) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1287) split_huge_page_to_list(page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1288) page_list))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1289) goto activate_locked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1290) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1291) if (!add_to_swap(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1292) if (!PageTransHuge(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1293) goto activate_locked_split;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1294) /* Fallback to swap normal pages */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1295) if (split_huge_page_to_list(page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1296) page_list))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1297) goto activate_locked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1298) #ifdef CONFIG_TRANSPARENT_HUGEPAGE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1299) count_vm_event(THP_SWPOUT_FALLBACK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1300) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1301) if (!add_to_swap(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1302) goto activate_locked_split;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1303) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1304)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1305) may_enter_fs = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1306)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1307) /* Adding to swap updated mapping */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1308) mapping = page_mapping(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1309) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1310) } else if (unlikely(PageTransHuge(page))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1311) /* Split file THP */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1312) if (split_huge_page_to_list(page, page_list))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1313) goto keep_locked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1314) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1315)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1316) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1317) * The THP may have been split above; if so, subtract the tail
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1318) * pages and update nr_pages to avoid accounting them twice.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1319) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1320) * Tail pages that were successfully added to the swap cache
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1321) * also reach here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1322) */
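/*
 * For example, a 512-page THP that was split above is now an order-0
 * head page, and its former tail pages sit on page_list to be scanned
 * in their own right, so the head's original contribution of 512 to
 * nr_scanned is reduced to 1 here.
 */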
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1323) if ((nr_pages > 1) && !PageTransHuge(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1324) sc->nr_scanned -= (nr_pages - 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1325) nr_pages = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1326) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1327)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1328) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1329) * The page is mapped into the page tables of one or more
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1330) * processes. Try to unmap it here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1331) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1332) if (page_mapped(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1333) enum ttu_flags flags = TTU_BATCH_FLUSH;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1334) bool was_swapbacked = PageSwapBacked(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1335)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1336) if (unlikely(PageTransHuge(page)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1337) flags |= TTU_SPLIT_HUGE_PMD;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1338)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1339) if (!try_to_unmap(page, flags)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1340) stat->nr_unmap_fail += nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1341) if (!was_swapbacked && PageSwapBacked(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1342) stat->nr_lazyfree_fail += nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1343) goto activate_locked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1344) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1345) }
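/*
 * With TTU_BATCH_FLUSH, try_to_unmap() above may defer the TLB flush;
 * the batched flush is issued later, by try_to_unmap_flush_dirty()
 * before pageout() and by try_to_unmap_flush() once the whole list has
 * been processed.
 */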
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1346)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1347) if (PageDirty(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1348) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1349) * Only kswapd can write back filesystem pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1350) * to avoid risk of stack overflow. But avoid
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1351) * injecting inefficient single-page IO into
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1352) * flusher writeback as much as possible: only
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1353) * write pages when we've encountered many
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1354) * dirty pages, and when we've already scanned
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1355) * the rest of the LRU for clean pages and see
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1356) * the same dirty pages again (PageReclaim).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1357) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1358) if (page_is_file_lru(page) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1359) (!current_is_kswapd() || !PageReclaim(page) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1360) !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1361) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1362) * Immediately reclaim when written back.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1363) * Similar in principle to deactivate_page()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1364) * except we already have the page isolated
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1365) * and know it's dirty
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1366) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1367) inc_node_page_state(page, NR_VMSCAN_IMMEDIATE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1368) SetPageReclaim(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1369)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1370) goto activate_locked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1371) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1372)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1373) if (references == PAGEREF_RECLAIM_CLEAN)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1374) goto keep_locked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1375) if (!may_enter_fs)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1376) goto keep_locked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1377) if (!sc->may_writepage)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1378) goto keep_locked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1379)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1380) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1381) * Page is dirty. Flush the TLB if a writable entry
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1382) * potentially exists to avoid CPU writes after IO
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1383) * starts and then write it out here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1384) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1385) try_to_unmap_flush_dirty();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1386) switch (pageout(page, mapping)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1387) case PAGE_KEEP:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1388) goto keep_locked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1389) case PAGE_ACTIVATE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1390) goto activate_locked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1391) case PAGE_SUCCESS:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1392) stat->nr_pageout += thp_nr_pages(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1393)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1394) if (PageWriteback(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1395) goto keep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1396) if (PageDirty(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1397) goto keep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1398)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1399) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1400) * A synchronous write - probably a ramdisk. Go
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1401) * ahead and try to reclaim the page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1402) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1403) if (!trylock_page(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1404) goto keep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1405) if (PageDirty(page) || PageWriteback(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1406) goto keep_locked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1407) mapping = page_mapping(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1408) case PAGE_CLEAN:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1409) ; /* try to free the page below */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1410) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1411) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1412)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1413) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1414) * If the page has buffers, try to free the buffer mappings
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1415) * associated with this page. If we succeed we try to free
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1416) * the page as well.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1417) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1418) * We do this even if the page is PageDirty().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1419) * try_to_release_page() does not perform I/O, but it is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1420) * possible for a page to have PageDirty set, but it is actually
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1421) * clean (all its buffers are clean). This happens if the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1422) * buffers were written out directly, with submit_bh(). ext3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1423) * will do this, as well as the blockdev mapping.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1424) * try_to_release_page() will discover that cleanness and will
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1425) * drop the buffers and mark the page clean - it can be freed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1426) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1427) * Rarely, pages can have buffers and no ->mapping. These are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1428) * the pages which were not successfully invalidated in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1429) * truncate_complete_page(). We try to drop those buffers here
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1430) * and if that worked, and the page is no longer mapped into
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1431) * process address space (page_count == 1) it can be freed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1432) * Otherwise, leave the page on the LRU so it is swappable.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1433) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1434) if (page_has_private(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1435) if (!try_to_release_page(page, sc->gfp_mask))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1436) goto activate_locked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1437) if (!mapping && page_count(page) == 1) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1438) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1439) if (put_page_testzero(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1440) goto free_it;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1441) else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1442) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1443) * rare race with speculative reference.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1444) * the speculative reference will free
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1445) * this page shortly, so we may
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1446) * increment nr_reclaimed here (and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1447) * leave it off the LRU).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1448) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1449) nr_reclaimed++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1450) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1451) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1452) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1453) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1454)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1455) if (PageAnon(page) && !PageSwapBacked(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1456) /* follow __remove_mapping for reference */
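/*
 * A clean lazyfree (MADV_FREE) anon page has no swap or page cache
 * reference, so the only count expected here is the caller's isolation
 * reference. If the page turns out to have been re-dirtied, the count
 * is restored and the page is kept rather than freed.
 */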
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1457) if (!page_ref_freeze(page, 1))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1458) goto keep_locked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1459) if (PageDirty(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1460) page_ref_unfreeze(page, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1461) goto keep_locked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1462) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1463)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1464) count_vm_event(PGLAZYFREED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1465) count_memcg_page_event(page, PGLAZYFREED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1466) } else if (!mapping || !__remove_mapping(mapping, page, true,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1467) sc->target_mem_cgroup))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1468) goto keep_locked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1469)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1470) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1471) free_it:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1472) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1473) * A THP may get swapped out as a whole, so account
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1474) * all of its base pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1475) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1476) nr_reclaimed += nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1477)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1478) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1479) * Is there a need to periodically drain the free_pages list? It
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1480) * would appear not, as the counts should be low.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1481) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1482) if (unlikely(PageTransHuge(page)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1483) destroy_compound_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1484) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1485) list_add(&page->lru, &free_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1486) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1487)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1488) activate_locked_split:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1489) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1490) * Tail pages that failed to be added to the swap cache
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1491) * reach here. Fix up nr_scanned and nr_pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1492) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1493) if (nr_pages > 1) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1494) sc->nr_scanned -= (nr_pages - 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1495) nr_pages = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1496) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1497) activate_locked:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1498) /* Not a candidate for swapping, so reclaim swap space. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1499) if (PageSwapCache(page) && (mem_cgroup_swap_full(page) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1500) PageMlocked(page)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1501) try_to_free_swap(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1502) VM_BUG_ON_PAGE(PageActive(page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1503) if (!PageMlocked(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1504) int type = page_is_file_lru(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1505) SetPageActive(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1506) stat->nr_activate[type] += nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1507) count_memcg_page_event(page, PGACTIVATE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1508) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1509) keep_locked:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1510) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1511) keep:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1512) list_add(&page->lru, &ret_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1513) VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1514) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1515)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1516) pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1517)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1518) mem_cgroup_uncharge_list(&free_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1519) try_to_unmap_flush();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1520) free_unref_page_list(&free_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1521)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1522) list_splice(&ret_pages, page_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1523) count_vm_events(PGACTIVATE, pgactivate);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1524)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1525) return nr_reclaimed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1526) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1527)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1528) unsigned int reclaim_clean_pages_from_list(struct zone *zone,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1529) struct list_head *page_list)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1530) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1531) struct scan_control sc = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1532) .gfp_mask = GFP_KERNEL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1533) .priority = DEF_PRIORITY,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1534) .may_unmap = 1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1535) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1536) struct reclaim_stat stat;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1537) unsigned int nr_reclaimed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1538) struct page *page, *next;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1539) LIST_HEAD(clean_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1540)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1541) list_for_each_entry_safe(page, next, page_list, lru) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1542) if (page_is_file_lru(page) && !PageDirty(page) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1543) !__PageMovable(page) && !PageUnevictable(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1544) ClearPageActive(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1545) list_move(&page->lru, &clean_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1546) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1547) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1548)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1549) nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1550) &stat, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1551) list_splice(&clean_pages, page_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1552) mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1553) -(long)nr_reclaimed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1554) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1555) * Since lazyfree pages are isolated from the file LRU from the beginning,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1556) * they will rotate back to the anonymous LRU in the end if discarding
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1557) * fails, so the isolated counts would be mismatched.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1558) * Compensate the isolated counts for both LRU lists.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1559) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1560) mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1561) stat.nr_lazyfree_fail);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1562) mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1563) -(long)stat.nr_lazyfree_fail);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1564) return nr_reclaimed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1565) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1566)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1567) int reclaim_pages_from_list(struct list_head *page_list)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1568) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1569) struct scan_control sc = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1570) .gfp_mask = GFP_KERNEL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1571) .priority = DEF_PRIORITY,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1572) .may_writepage = 1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1573) .may_unmap = 1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1574) .may_swap = 1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1575) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1576) unsigned long nr_reclaimed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1577) struct reclaim_stat dummy_stat;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1578) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1579)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1580) list_for_each_entry(page, page_list, lru)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1581) ClearPageActive(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1582)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1583) nr_reclaimed = shrink_page_list(page_list, NULL, &sc,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1584) &dummy_stat, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1585) while (!list_empty(page_list)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1586)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1587) page = lru_to_page(page_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1588) list_del(&page->lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1589) dec_node_page_state(page, NR_ISOLATED_ANON +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1590) page_is_file_lru(page));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1591) putback_lru_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1592) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1593)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1594) return nr_reclaimed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1595) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1596)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1597) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1598) * Attempt to remove the specified page from its LRU. Only take this page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1599) * if it is of the appropriate PageActive status. Pages which are being
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1600) * freed elsewhere are also ignored.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1601) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1602) * page: page to consider
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1603) * mode: one of the LRU isolation modes defined above
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1604) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1605) * returns 0 on success, -ve errno on failure.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1606) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1607) int __isolate_lru_page(struct page *page, isolate_mode_t mode)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1608) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1609) int ret = -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1610)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1611) /* Only take pages on the LRU. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1612) if (!PageLRU(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1613) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1614)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1615) /* Compaction should not handle unevictable pages but CMA can do so */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1616) if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1617) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1618)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1619) ret = -EBUSY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1620)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1621) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1622) * To minimise LRU disruption, the caller can indicate that it only
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1623) * wants to isolate pages it will be able to operate on without
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1624) * blocking - clean pages for the most part.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1625) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1626) * ISOLATE_ASYNC_MIGRATE is used to indicate that the caller only wants
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1627) * pages that can be migrated without blocking.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1628) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1629) if (mode & ISOLATE_ASYNC_MIGRATE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1630) /* All the caller can do on PageWriteback is block */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1631) if (PageWriteback(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1632) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1633)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1634) if (PageDirty(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1635) struct address_space *mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1636) bool migrate_dirty;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1637)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1638) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1639) * Only pages without mappings or that have a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1640) * ->migratepage callback are possible to migrate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1641) * without blocking. However, we can be racing with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1642) * truncation so it's necessary to lock the page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1643) * to stabilise the mapping as truncation holds
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1644) * the page lock until after the page is removed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1645) * from the page cache.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1646) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1647) if (!trylock_page(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1648) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1649)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1650) mapping = page_mapping(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1651) migrate_dirty = !mapping || mapping->a_ops->migratepage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1652) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1653) if (!migrate_dirty)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1654) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1655) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1656) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1657)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1658) if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1659) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1660)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1661) if (likely(get_page_unless_zero(page))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1662) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1663) * Be careful not to clear PageLRU until after we're
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1664) * sure the page is not being freed elsewhere -- the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1665) * page release code relies on it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1666) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1667) ClearPageLRU(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1668) ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1669) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1670)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1671) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1672) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1673)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1674)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1675) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1676) * Update LRU sizes after isolating pages. The LRU size updates must
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1677) * be complete before mem_cgroup_update_lru_size due to a sanity check.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1678) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1679) static __always_inline void update_lru_sizes(struct lruvec *lruvec,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1680) enum lru_list lru, unsigned long *nr_zone_taken)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1681) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1682) int zid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1683)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1684) for (zid = 0; zid < MAX_NR_ZONES; zid++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1685) if (!nr_zone_taken[zid])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1686) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1687)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1688) update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1689) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1690)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1691) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1692)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1693) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1694) * isolate_lru_pages - isolate a batch of pages from an LRU list
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1695) * @nr_to_scan: The number of eligible pages to look through on the list.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1696) * @lruvec: The LRU vector to pull pages from.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1697) * @dst: The temp list to put pages on to.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1698) * @nr_scanned: The number of pages that were scanned.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1699) * @sc: The scan_control struct for this reclaim session.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1700) * @lru: LRU list id for isolating.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1701) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1702) * pgdat->lru_lock is heavily contended. Some of the functions that shrink
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1703) * the lists perform better by taking out a batch of pages and working on
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1704) * them outside the LRU lock. For pagecache intensive workloads, this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1705) * function is the hottest spot in the kernel (apart from the copy_*_user
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1706) * functions).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1707) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1708) * Appropriate locks must be held before calling this function.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1709) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1710) * Return: the number of pages moved onto *@dst.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1711) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1712) static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1713) struct lruvec *lruvec, struct list_head *dst,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1714) unsigned long *nr_scanned, struct scan_control *sc,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1715) enum lru_list lru)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1716) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1717) struct list_head *src = &lruvec->lists[lru];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1718) unsigned long nr_taken = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1719) unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1720) unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1721) unsigned long skipped = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1722) unsigned long scan, total_scan, nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1723) LIST_HEAD(pages_skipped);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1724) isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1725)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1726) total_scan = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1727) scan = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1728) while (scan < nr_to_scan && !list_empty(src)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1729) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1730)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1731) page = lru_to_page(src);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1732) prefetchw_prev_lru_page(page, src, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1733)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1734) VM_BUG_ON_PAGE(!PageLRU(page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1735)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1736) nr_pages = compound_nr(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1737) total_scan += nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1738)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1739) if (page_zonenum(page) > sc->reclaim_idx) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1740) list_move(&page->lru, &pages_skipped);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1741) nr_skipped[page_zonenum(page)] += nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1742) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1743) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1744)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1745) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1746) * Do not count skipped pages towards the scan budget, because that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1747) * would make the function return with no isolated pages if the LRU
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1748) * mostly contains ineligible pages. This causes the VM to not reclaim
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1749) * any pages, triggering a premature OOM.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1750) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1751) * Account all tail pages of THP. This would not cause
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1752) * premature OOM since __isolate_lru_page() returns -EBUSY
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1753) * only when the page is being freed somewhere else.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1754) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1755) scan += nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1756) switch (__isolate_lru_page(page, mode)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1757) case 0:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1758) nr_taken += nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1759) nr_zone_taken[page_zonenum(page)] += nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1760) list_move(&page->lru, dst);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1761) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1762)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1763) case -EBUSY:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1764) /* else it is being freed elsewhere */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1765) list_move(&page->lru, src);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1766) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1767)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1768) default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1769) BUG();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1770) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1771) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1772)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1773) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1774) * Splice any skipped pages to the start of the LRU list. Note that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1775) * this disrupts the LRU order when reclaiming for lower zones but
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1776) * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1777) * scanning would soon rescan the same pages to skip and put the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1778) * system at risk of premature OOM.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1779) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1780) if (!list_empty(&pages_skipped)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1781) int zid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1782)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1783) list_splice(&pages_skipped, src);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1784) for (zid = 0; zid < MAX_NR_ZONES; zid++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1785) if (!nr_skipped[zid])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1786) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1787)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1788) __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1789) skipped += nr_skipped[zid];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1790) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1791) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1792) *nr_scanned = total_scan;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1793) trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1794) total_scan, skipped, nr_taken, mode, lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1795) update_lru_sizes(lruvec, lru, nr_zone_taken);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1796) return nr_taken;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1797) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1798)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1799) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1800) * isolate_lru_page - tries to isolate a page from its LRU list
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1801) * @page: page to isolate from its LRU list
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1802) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1803) * Isolates a @page from an LRU list, clears PageLRU and adjusts the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1804) * vmstat statistic corresponding to whatever LRU list the page was on.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1805) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1806) * Returns 0 if the page was removed from an LRU list.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1807) * Returns -EBUSY if the page was not on an LRU list.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1808) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1809) * On success the page will have PageLRU() cleared. If it was found on
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1810) * the active list, it will have PageActive set. If it was found on
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1811) * the unevictable list, it will have the PageUnevictable bit set. That flag
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1812) * may need to be cleared by the caller before letting the page go.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1813) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1814) * The vmstat statistic corresponding to the list on which the page was
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1815) * found will be decremented.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1816) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1817) * Restrictions:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1818) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1819) * (1) Must be called with an elevated refcount on the page. This is a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1820) * fundamental difference from isolate_lru_pages (which is called
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1821) * without a stable reference).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1822) * (2) the lru_lock must not be held.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1823) * (3) interrupts must be enabled.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1824) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1825) int isolate_lru_page(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1826) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1827) int ret = -EBUSY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1828)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1829) VM_BUG_ON_PAGE(!page_count(page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1830) WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1831)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1832) if (PageLRU(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1833) pg_data_t *pgdat = page_pgdat(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1834) struct lruvec *lruvec;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1835)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1836) spin_lock_irq(&pgdat->lru_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1837) lruvec = mem_cgroup_page_lruvec(page, pgdat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1838) if (PageLRU(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1839) int lru = page_lru(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1840) get_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1841) ClearPageLRU(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1842) del_page_from_lru_list(page, lruvec, lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1843) ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1844) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1845) spin_unlock_irq(&pgdat->lru_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1846) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1847) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1848) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1849)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1850) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1851) * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1852) * then get rescheduled. When there is a massive number of tasks doing page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1853) * allocation, such sleeping direct reclaimers may keep piling up on each CPU;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1854) * the LRU list then shrinks and is scanned faster than necessary, leading to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1855) * unnecessary swapping, thrashing and OOM.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1856) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1857) static int too_many_isolated(struct pglist_data *pgdat, int file,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1858) struct scan_control *sc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1859) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1860) unsigned long inactive, isolated;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1861)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1862) if (current_is_kswapd())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1863) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1864)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1865) if (!writeback_throttling_sane(sc))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1866) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1867)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1868) if (file) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1869) inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1870) isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1871) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1872) inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1873) isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1874) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1875)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1876) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1877) * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1878) * won't get blocked by normal direct-reclaimers, forming a circular
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1879) * deadlock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1880) */
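/*
* Illustrative numbers: with 16384 inactive file pages, a GFP_KERNEL
* direct reclaimer (__GFP_IO and __GFP_FS both set) is considered "too
* many isolated" once more than 16384 / 8 = 2048 file pages are
* isolated, while a GFP_NOFS caller is only throttled once isolated
* pages exceed the full 16384.
*/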
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1881) if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1882) inactive >>= 3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1883)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1884) return isolated > inactive;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1885) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1886)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1887) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1888) * This moves pages from @list to corresponding LRU list.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1889) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1890) * We move them the other way if the page is referenced by one or more
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1891) * processes, from rmap.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1892) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1893) * If the pages are mostly unmapped, the processing is fast and it is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1894) * appropriate to hold pgdat->lru_lock across the whole operation. But if
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1895) * the pages are mapped, the processing is slow (page_referenced()), so we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1896) * should drop pgdat->lru_lock around each page. It's impossible to balance
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1897) * this, so instead we remove the pages from the LRU while processing them.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1898) * It is safe to rely on PG_active against the non-LRU pages in here because
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1899) * nobody will play with that bit on a non-LRU page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1900) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1901) * The downside is that we have to touch page->_refcount against each page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1902) * But we had to alter page->flags anyway.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1903) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1904) * Returns the number of pages moved to the given lruvec.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1905) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1906)
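/*
* Note on the freeing path below: isolation took a page reference via
* get_page_unless_zero(), which put_page_testzero() drops here. If that
* was the last reference, all other users let go of the page while it
* was isolated, so it is taken back off the LRU it was just added to
* and batched onto pages_to_free (compound pages are destroyed
* directly, dropping the lru_lock around destroy_compound_page()).
*/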
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1907) static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1908) struct list_head *list)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1909) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1910) struct pglist_data *pgdat = lruvec_pgdat(lruvec);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1911) int nr_pages, nr_moved = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1912) LIST_HEAD(pages_to_free);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1913) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1914) enum lru_list lru;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1915)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1916) while (!list_empty(list)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1917) page = lru_to_page(list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1918) VM_BUG_ON_PAGE(PageLRU(page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1919) if (unlikely(!page_evictable(page))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1920) list_del(&page->lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1921) spin_unlock_irq(&pgdat->lru_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1922) putback_lru_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1923) spin_lock_irq(&pgdat->lru_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1924) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1925) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1926) lruvec = mem_cgroup_page_lruvec(page, pgdat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1927)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1928) SetPageLRU(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1929) lru = page_lru(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1930)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1931) nr_pages = thp_nr_pages(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1932) update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1933) list_move(&page->lru, &lruvec->lists[lru]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1934)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1935) if (put_page_testzero(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1936) __ClearPageLRU(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1937) __ClearPageActive(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1938) del_page_from_lru_list(page, lruvec, lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1939)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1940) if (unlikely(PageCompound(page))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1941) spin_unlock_irq(&pgdat->lru_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1942) destroy_compound_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1943) spin_lock_irq(&pgdat->lru_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1944) } else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1945) list_add(&page->lru, &pages_to_free);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1946) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1947) nr_moved += nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1948) if (PageActive(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1949) workingset_age_nonresident(lruvec, nr_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1950) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1951) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1952)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1953) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1954) * To save our caller's stack, now use input list for pages to free.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1955) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1956) list_splice(&pages_to_free, list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1957)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1958) return nr_moved;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1959) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1960)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1961) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1962) * If a kernel thread (such as nfsd for loop-back mounts) services
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1963) * a backing device by writing to the page cache it sets PF_LOCAL_THROTTLE.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1964) * In that case we should only throttle if the backing device it is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1965) * writing to is congested. In other cases it is safe to throttle.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1966) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1967) static int current_may_throttle(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1968) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1969) return !(current->flags & PF_LOCAL_THROTTLE) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1970) current->backing_dev_info == NULL ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1971) bdi_write_congested(current->backing_dev_info);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1972) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1973)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1974) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1975) * shrink_inactive_list() is a helper for shrink_node(). It returns the number
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1976) * of reclaimed pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1977) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1978) static noinline_for_stack unsigned long
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1979) shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1980) struct scan_control *sc, enum lru_list lru)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1981) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1982) LIST_HEAD(page_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1983) unsigned long nr_scanned;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1984) unsigned int nr_reclaimed = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1985) unsigned long nr_taken;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1986) struct reclaim_stat stat;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1987) bool file = is_file_lru(lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1988) enum vm_event_item item;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1989) struct pglist_data *pgdat = lruvec_pgdat(lruvec);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1990) bool stalled = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1991)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1992) while (unlikely(too_many_isolated(pgdat, file, sc))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1993) if (stalled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1994) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1995)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1996) /* wait a bit for the reclaimer. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1997) msleep(100);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1998) stalled = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1999)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2000) /* We are about to die and free our memory. Return now. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2001) if (fatal_signal_pending(current))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2002) return SWAP_CLUSTER_MAX;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2003) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2004)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2005) lru_add_drain();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2006)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2007) spin_lock_irq(&pgdat->lru_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2008)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2009) nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2010) &nr_scanned, sc, lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2011)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2012) __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2013) item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2014) if (!cgroup_reclaim(sc))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2015) __count_vm_events(item, nr_scanned);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2016) __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2017) __count_vm_events(PGSCAN_ANON + file, nr_scanned);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2018)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2019) spin_unlock_irq(&pgdat->lru_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2020)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2021) if (nr_taken == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2022) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2023)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2024) nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, &stat, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2025)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2026) spin_lock_irq(&pgdat->lru_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2027)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2028) move_pages_to_lru(lruvec, &page_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2029)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2030) __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2031) lru_note_cost(lruvec, file, stat.nr_pageout);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2032) item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2033) if (!cgroup_reclaim(sc))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2034) __count_vm_events(item, nr_reclaimed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2035) __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2036) __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2037)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2038) spin_unlock_irq(&pgdat->lru_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2039)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2040) mem_cgroup_uncharge_list(&page_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2041) free_unref_page_list(&page_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2042)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2043) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2044) * If dirty pages are scanned that are not queued for IO, it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2045) * implies that flushers are not doing their job. This can
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2046) * happen when memory pressure pushes dirty pages to the end of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2047) * the LRU before the dirty limits are breached and the dirty
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2048) * data has expired. It can also happen when the proportion of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2049) * dirty pages grows not through writes but through memory
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2050) * pressure reclaiming all the clean cache. And in some cases,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2051) * the flushers simply cannot keep up with the allocation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2052) * rate. Nudge the flusher threads in case they are asleep.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2053) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2054) if (stat.nr_unqueued_dirty == nr_taken)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2055) wakeup_flusher_threads(WB_REASON_VMSCAN);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2056)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2057) sc->nr.dirty += stat.nr_dirty;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2058) sc->nr.congested += stat.nr_congested;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2059) sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2060) sc->nr.writeback += stat.nr_writeback;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2061) sc->nr.immediate += stat.nr_immediate;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2062) sc->nr.taken += nr_taken;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2063) if (file)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2064) sc->nr.file_taken += nr_taken;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2065)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2066) trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2067) nr_scanned, nr_reclaimed, &stat, sc->priority, file);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2068) return nr_reclaimed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2069) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2070)
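/*
* shrink_active_list() does not reclaim pages itself: it takes a batch of
* pages off the tail of an active LRU list and moves most of them to the
* corresponding inactive list. Referenced, executable file pages get one
* more round on the active list (see the page_referenced() check below);
* everything else is deactivated and marked with PG_workingset so a later
* refault can be detected.
*/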
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2071) static void shrink_active_list(unsigned long nr_to_scan,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2072) struct lruvec *lruvec,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2073) struct scan_control *sc,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2074) enum lru_list lru)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2075) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2076) unsigned long nr_taken;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2077) unsigned long nr_scanned;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2078) unsigned long vm_flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2079) LIST_HEAD(l_hold); /* The pages which were snipped off */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2080) LIST_HEAD(l_active);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2081) LIST_HEAD(l_inactive);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2082) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2083) unsigned nr_deactivate, nr_activate;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2084) unsigned nr_rotated = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2085) int file = is_file_lru(lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2086) struct pglist_data *pgdat = lruvec_pgdat(lruvec);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2087) bool bypass = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2088)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2089) lru_add_drain();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2090)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2091) spin_lock_irq(&pgdat->lru_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2092)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2093) nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2094) &nr_scanned, sc, lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2095)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2096) __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2097)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2098) if (!cgroup_reclaim(sc))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2099) __count_vm_events(PGREFILL, nr_scanned);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2100) __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2101)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2102) spin_unlock_irq(&pgdat->lru_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2103)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2104) while (!list_empty(&l_hold)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2105) cond_resched();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2106) page = lru_to_page(&l_hold);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2107) list_del(&page->lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2108)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2109) if (unlikely(!page_evictable(page))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2110) putback_lru_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2111) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2112) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2113)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2114) if (unlikely(buffer_heads_over_limit)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2115) if (page_has_private(page) && trylock_page(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2116) if (page_has_private(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2117) try_to_release_page(page, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2118) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2119) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2120) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2121)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2122) trace_android_vh_page_referenced_check_bypass(page, nr_to_scan, lru, &bypass);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2123) if (bypass)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2124) goto skip_page_referenced;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2125)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2126) if (page_referenced(page, 0, sc->target_mem_cgroup,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2127) &vm_flags)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2128) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2129) * Identify referenced, file-backed active pages and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2130) * give them one more trip around the active list, so
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2131) * that executable code gets a better chance to stay in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2132) * memory under moderate memory pressure. Anon pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2133) * are not likely to be evicted by use-once streaming
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2134) * IO, plus the JVM can create lots of anon VM_EXEC pages,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2135) * so we ignore them here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2136) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2137) if ((vm_flags & VM_EXEC) && page_is_file_lru(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2138) nr_rotated += thp_nr_pages(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2139) list_add(&page->lru, &l_active);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2140) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2141) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2142) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2143) skip_page_referenced:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2144) ClearPageActive(page); /* we are de-activating */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2145) SetPageWorkingset(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2146) list_add(&page->lru, &l_inactive);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2147) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2148)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2149) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2150) * Move pages back to the lru list.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2151) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2152) spin_lock_irq(&pgdat->lru_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2153)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2154) nr_activate = move_pages_to_lru(lruvec, &l_active);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2155) nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2156) /* Collect all pages to be freed on l_active so they are released together below */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2157) list_splice(&l_inactive, &l_active);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2158)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2159) __count_vm_events(PGDEACTIVATE, nr_deactivate);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2160) __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2161)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2162) __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2163) spin_unlock_irq(&pgdat->lru_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2164)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2165) mem_cgroup_uncharge_list(&l_active);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2166) free_unref_page_list(&l_active);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2167) trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2168) nr_deactivate, nr_rotated, sc->priority, file);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2169) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2170)
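/*
* Reclaim the pages on @page_list. shrink_page_list() operates on one
* node at a time, so runs of pages from the same node are batched onto
* node_page_list and flushed whenever a page from a different node is
* encountered (and once more after the loop). Pages that survive
* reclaim are put back on their LRU.
*/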
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2171) unsigned long reclaim_pages(struct list_head *page_list)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2172) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2173) int nid = NUMA_NO_NODE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2174) unsigned int nr_reclaimed = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2175) LIST_HEAD(node_page_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2176) struct reclaim_stat dummy_stat;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2177) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2178) struct scan_control sc = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2179) .gfp_mask = GFP_KERNEL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2180) .priority = DEF_PRIORITY,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2181) .may_writepage = 1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2182) .may_unmap = 1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2183) .may_swap = 1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2184) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2185)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2186) while (!list_empty(page_list)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2187) page = lru_to_page(page_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2188) if (nid == NUMA_NO_NODE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2189) nid = page_to_nid(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2190) INIT_LIST_HEAD(&node_page_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2191) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2192)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2193) if (nid == page_to_nid(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2194) ClearPageActive(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2195) list_move(&page->lru, &node_page_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2196) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2197) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2198)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2199) nr_reclaimed += shrink_page_list(&node_page_list,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2200) NODE_DATA(nid),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2201) &sc, &dummy_stat, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2202) while (!list_empty(&node_page_list)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2203) page = lru_to_page(&node_page_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2204) list_del(&page->lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2205) putback_lru_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2206) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2207)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2208) nid = NUMA_NO_NODE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2209) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2210)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2211) if (!list_empty(&node_page_list)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2212) nr_reclaimed += shrink_page_list(&node_page_list,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2213) NODE_DATA(nid),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2214) &sc, &dummy_stat, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2215) while (!list_empty(&node_page_list)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2216) page = lru_to_page(&node_page_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2217) list_del(&page->lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2218) putback_lru_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2219) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2220) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2221)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2222) return nr_reclaimed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2223) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2224)
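/*
* shrink_list() dispatches to shrink_active_list(), which only deactivates
* pages and therefore reports zero reclaimed, or to shrink_inactive_list().
* Deactivation is skipped when sc->may_deactivate does not allow it for
* this LRU type, which is recorded in sc->skipped_deactivate.
*/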
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2225) static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2226) struct lruvec *lruvec, struct scan_control *sc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2227) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2228) if (is_active_lru(lru)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2229) if (sc->may_deactivate & (1 << is_file_lru(lru)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2230) shrink_active_list(nr_to_scan, lruvec, sc, lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2231) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2232) sc->skipped_deactivate = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2233) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2234) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2235)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2236) return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2237) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2238)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2239) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2240) * The inactive anon list should be small enough that the VM never has
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2241) * to do too much work.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2242) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2243) * The inactive file list should be small enough to leave most memory
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2244) * to the established workingset on the scan-resistant active list,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2245) * but large enough to avoid thrashing the aggregate readahead window.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2246) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2247) * Both inactive lists should also be large enough that each inactive
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2248) * page has a chance to be referenced again before it is reclaimed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2249) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2250) * If that fails and refaulting is observed, the inactive list grows.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2251) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2252) * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2253) * on this LRU, maintained by the pageout code. An inactive_ratio
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2254) * of 3 means 3:1 or 25% of the pages are kept on the inactive list.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2255) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2256) * total target max
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2257) * memory ratio inactive
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2258) * -------------------------------------
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2259) * 10MB 1 5MB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2260) * 100MB 1 50MB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2261) * 1GB 3 250MB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2262) * 10GB 10 0.9GB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2263) * 100GB 31 3GB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2264) * 1TB 101 10GB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2265) * 10TB 320 32GB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2266) */
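/*
* The table above follows from inactive_ratio = int_sqrt(10 * gb)
* computed below: e.g. 10GB of pages gives int_sqrt(100) = 10, so at
* most 1/11 of them (about 0.9GB) stay inactive, and 1TB gives
* int_sqrt(10240) = 101.
*/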
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2267) static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2268) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2269) enum lru_list active_lru = inactive_lru + LRU_ACTIVE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2270) unsigned long inactive, active;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2271) unsigned long inactive_ratio;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2272) unsigned long gb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2273) bool skip = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2274)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2275) inactive = lruvec_page_state(lruvec, NR_LRU_BASE + inactive_lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2276) active = lruvec_page_state(lruvec, NR_LRU_BASE + active_lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2277)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2278) gb = (inactive + active) >> (30 - PAGE_SHIFT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2279) trace_android_vh_inactive_is_low(gb, &inactive_ratio, inactive_lru, &skip);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2280) if (skip)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2281) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2282)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2283) if (gb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2284) inactive_ratio = int_sqrt(10 * gb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2285) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2286) inactive_ratio = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2287)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2288) trace_android_vh_tune_inactive_ratio(&inactive_ratio, is_file_lru(inactive_lru));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2289)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2290) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2291) return inactive * inactive_ratio < active;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2292) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2293)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2294) enum scan_balance {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2295) SCAN_EQUAL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2296) SCAN_FRACT,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2297) SCAN_ANON,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2298) SCAN_FILE,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2299) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2300)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2301) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2302) * Determine how aggressively the anon and file LRU lists should be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2303) * scanned. The relative value of each set of LRU lists is determined
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2304) * by looking at the fraction of the scanned pages that we rotated back
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2305) * onto the active list instead of evicting.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2306) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2307) * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2308) * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2309) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2310) static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2311) unsigned long *nr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2312) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2313) struct mem_cgroup *memcg = lruvec_memcg(lruvec);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2314) unsigned long anon_cost, file_cost, total_cost;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2315) int swappiness = mem_cgroup_swappiness(memcg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2316) u64 fraction[ANON_AND_FILE];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2317) u64 denominator = 0; /* gcc */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2318) enum scan_balance scan_balance;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2319) unsigned long ap, fp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2320) enum lru_list lru;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2321) bool balance_anon_file_reclaim = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2322)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2323) /* If we have no swap space, do not bother scanning anon pages. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2324) if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2325) scan_balance = SCAN_FILE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2326) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2327) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2328)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2329) trace_android_vh_tune_swappiness(&swappiness);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2330) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2331) * Global reclaim will swap to prevent OOM even with no
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2332) * swappiness, but memcg users want to use this knob to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2333) * disable swapping for individual groups completely when
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2334) * using the memory controller's swap limit feature would be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2335) * too expensive.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2336) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2337) if (cgroup_reclaim(sc) && !swappiness) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2338) scan_balance = SCAN_FILE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2339) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2340) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2341)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2342) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2343) * Do not apply any pressure balancing cleverness when the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2344) * system is close to OOM, scan both anon and file equally
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2345) * (unless the swappiness setting disagrees with swapping).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2346) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2347) if (!sc->priority && swappiness) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2348) scan_balance = SCAN_EQUAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2349) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2350) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2351)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2352) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2353) * If the system is almost out of file pages, force-scan anon.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2354) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2355) if (sc->file_is_tiny) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2356) scan_balance = SCAN_ANON;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2357) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2358) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2359)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2360) trace_android_rvh_set_balance_anon_file_reclaim(&balance_anon_file_reclaim);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2361)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2362) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2363) * If there is enough inactive page cache, we do not reclaim
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2364) * anything from the anonymous working set right now. But when balancing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2365) * anon and page cache files for reclaim, allow swapping of anon pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2366) * even if there are a number of inactive file cache pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2367) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2368) if (!balance_anon_file_reclaim && sc->cache_trim_mode) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2369) scan_balance = SCAN_FILE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2370) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2371) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2372)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2373) scan_balance = SCAN_FRACT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2374) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2375) * Calculate the pressure balance between anon and file pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2376) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2377) * The amount of pressure we put on each LRU is inversely
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2378) * proportional to the cost of reclaiming each list, as
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2379) * determined by the share of pages that are refaulting, times
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2380) * the relative IO cost of bringing back a swapped out
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2381) * anonymous page vs reloading a filesystem page (swappiness).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2382) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2383) * Although we limit that influence to ensure no list gets
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2384) * left behind completely: at least a third of the pressure is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2385) * applied, before swappiness.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2386) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2387) * With swappiness at 100, anon and file have equal IO cost.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2388) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2389) total_cost = sc->anon_cost + sc->file_cost;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2390) anon_cost = total_cost + sc->anon_cost;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2391) file_cost = total_cost + sc->file_cost;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2392) total_cost = anon_cost + file_cost;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2393)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2394) ap = swappiness * (total_cost + 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2395) ap /= anon_cost + 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2396)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2397) fp = (200 - swappiness) * (total_cost + 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2398) fp /= file_cost + 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2399)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2400) fraction[0] = ap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2401) fraction[1] = fp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2402) denominator = ap + fp;
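/*
 * Worked example with purely illustrative numbers (not taken from any
 * real workload): with swappiness == 60, sc->anon_cost == 100 and
 * sc->file_cost == 300, the code above yields anon_cost = 500,
 * file_cost = 700, total_cost = 1200, ap = 60 * 1201 / 501 = 143 and
 * fp = 140 * 1201 / 701 = 239. About 143/382 (~37%) of the pressure
 * then goes to the anon LRUs and ~63% to the file LRUs, with the
 * baseline "+ total_cost" terms keeping either side from being starved.
 */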
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2403) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2404) trace_android_vh_tune_scan_type((char *)(&scan_balance));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2405) trace_android_vh_tune_memcg_scan_type(memcg, (char *)(&scan_balance));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2406) for_each_evictable_lru(lru) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2407) int file = is_file_lru(lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2408) unsigned long lruvec_size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2409) unsigned long low, min;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2410) unsigned long scan;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2411)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2412) lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2413) mem_cgroup_protection(sc->target_mem_cgroup, memcg,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2414) &min, &low);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2415)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2416) if (min || low) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2417) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2418) * Scale a cgroup's reclaim pressure by proportioning
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2419) * its current usage to its memory.low or memory.min
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2420) * setting.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2421) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2422) * This is important, as otherwise scanning aggression
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2423) * becomes extremely binary -- from nothing as we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2424) * approach the memory protection threshold, to totally
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2425) * nominal as we exceed it. This results in requiring
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2426) * setting extremely liberal protection thresholds. It
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2427) * also means we simply get no protection at all if we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2428) * set it too low, which is not ideal.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2429) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2430) * If there is any protection in place, we reduce scan
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2431) * pressure by how much of the total memory used is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2432) * within protection thresholds.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2433) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2434) * There is one special case: in the first reclaim pass,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2435) * we skip over all groups that are within their low
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2436) * protection. If that fails to reclaim enough pages to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2437) * satisfy the reclaim goal, we come back and override
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2438) * the best-effort low protection. However, we still
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2439) * ideally want to honor how well-behaved groups are in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2440) * that case instead of simply punishing them all
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2441) * equally. As such, we reclaim them based on how much
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2442) * memory they are using, reducing the scan pressure
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2443) * again by how much of the total memory used is under
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2444) * hard protection.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2445) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2446) unsigned long cgroup_size = mem_cgroup_size(memcg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2447) unsigned long protection;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2448)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2449) /* memory.low scaling, make sure we retry before OOM */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2450) if (!sc->memcg_low_reclaim && low > min) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2451) protection = low;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2452) sc->memcg_low_skipped = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2453) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2454) protection = min;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2455) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2456)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2457) /* Avoid TOCTOU with earlier protection check */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2458) cgroup_size = max(cgroup_size, protection);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2459)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2460) scan = lruvec_size - lruvec_size * protection /
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2461) (cgroup_size + 1);
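/*
 * Illustrative numbers only: for a cgroup using 200000 pages
 * (cgroup_size) with an effective protection of 150000 pages and an
 * lruvec_size of 50000 on this LRU, the expression above gives
 * scan = 50000 - 50000 * 150000 / 200001 = 12501, i.e. roughly the
 * 25% of the usage that sits above the protection threshold.
 */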
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2462)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2463) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2464) * Minimally target SWAP_CLUSTER_MAX pages to keep
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2465) * reclaim moving forwards, avoiding decrementing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2466) * sc->priority further than desirable.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2467) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2468) scan = max(scan, SWAP_CLUSTER_MAX);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2469) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2470) scan = lruvec_size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2471) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2472)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2473) scan >>= sc->priority;
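/*
 * The shift divides the target by 2^priority: at DEF_PRIORITY (12)
 * only about 1/4096 of the list is considered per pass, while at
 * priority 0 the whole remaining target is scanned.
 */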
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2474)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2475) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2476) * If the cgroup's already been deleted, make sure to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2477) * scrape out the remaining cache.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2478) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2479) if (!scan && !mem_cgroup_online(memcg))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2480) scan = min(lruvec_size, SWAP_CLUSTER_MAX);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2481)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2482) switch (scan_balance) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2483) case SCAN_EQUAL:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2484) /* Scan lists relative to size */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2485) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2486) case SCAN_FRACT:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2487) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2488) * Scan types proportional to swappiness and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2489) * their relative recent reclaim efficiency.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2490) * Make sure we don't miss the last page of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2491) * an offlined memory cgroup because of a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2492) * round-off error.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2493) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2494) scan = mem_cgroup_online(memcg) ?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2495) div64_u64(scan * fraction[file], denominator) :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2496) DIV64_U64_ROUND_UP(scan * fraction[file],
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2497) denominator);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2498) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2499) case SCAN_FILE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2500) case SCAN_ANON:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2501) /* Scan one type exclusively */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2502) if ((scan_balance == SCAN_FILE) != file)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2503) scan = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2504) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2505) default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2506) /* Look ma, no brain */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2507) BUG();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2508) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2509)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2510) nr[lru] = scan;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2511) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2512) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2513)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2514) static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2515) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2516) unsigned long nr[NR_LRU_LISTS];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2517) unsigned long targets[NR_LRU_LISTS];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2518) unsigned long nr_to_scan;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2519) enum lru_list lru;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2520) unsigned long nr_reclaimed = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2521) unsigned long nr_to_reclaim = sc->nr_to_reclaim;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2522) struct blk_plug plug;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2523) bool scan_adjusted;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2524)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2525) get_scan_count(lruvec, sc, nr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2526)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2527) /* Record the original scan target for proportional adjustments later */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2528) memcpy(targets, nr, sizeof(nr));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2529)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2530) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2531) * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2532) * event that can occur when there is little memory pressure e.g.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2533) * multiple streaming readers/writers. Hence, we do not abort scanning
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2534) * when the requested number of pages has been reclaimed while scanning at
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2535) * DEF_PRIORITY, on the assumption that the fact we are direct
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2536) * reclaiming implies that kswapd is not keeping up and it is best to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2537) * do a batch of work at once. For memcg reclaim one check is made to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2538) * abort proportional reclaim if either the file or anon lru has already
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2539) * dropped to zero at the first pass.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2540) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2541) scan_adjusted = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2542) sc->priority == DEF_PRIORITY);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2543)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2544) blk_start_plug(&plug);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2545) while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2546) nr[LRU_INACTIVE_FILE]) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2547) unsigned long nr_anon, nr_file, percentage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2548) unsigned long nr_scanned;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2549)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2550) for_each_evictable_lru(lru) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2551) if (nr[lru]) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2552) nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2553) nr[lru] -= nr_to_scan;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2554)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2555) nr_reclaimed += shrink_list(lru, nr_to_scan,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2556) lruvec, sc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2557) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2558) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2559)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2560) cond_resched();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2561)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2562) if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2563) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2564)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2565) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2566) * For kswapd and memcg, reclaim at least the number of pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2567) * requested. Ensure that the anon and file LRUs are scanned
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2568) * proportionally to what was requested by get_scan_count(). We
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2569) * stop reclaiming one LRU and reduce the amount of scanning
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2570) * proportionally to the original scan target.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2571) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2572) nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2573) nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2574)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2575) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2576) * It's just vindictive to attack the larger once the smaller
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2577) * has gone to zero. And given the way we stop scanning the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2578) * smaller below, this makes sure that we only make one nudge
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2579) * towards proportionality once we've got nr_to_reclaim.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2580) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2581) if (!nr_file || !nr_anon)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2582) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2583)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2584) if (nr_file > nr_anon) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2585) unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2586) targets[LRU_ACTIVE_ANON] + 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2587) lru = LRU_BASE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2588) percentage = nr_anon * 100 / scan_target;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2589) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2590) unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2591) targets[LRU_ACTIVE_FILE] + 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2592) lru = LRU_FILE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2593) percentage = nr_file * 100 / scan_target;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2594) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2595)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2596) /* Stop scanning the smaller of the two LRU types */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2597) nr[lru] = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2598) nr[lru + LRU_ACTIVE] = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2599)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2600) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2601) * Recalculate the other LRU scan count based on its original
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2602) * scan target and the percentage of scanning already completed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2603) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2604) lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2605) nr_scanned = targets[lru] - nr[lru];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2606) nr[lru] = targets[lru] * (100 - percentage) / 100;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2607) nr[lru] -= min(nr[lru], nr_scanned);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2608)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2609) lru += LRU_ACTIVE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2610) nr_scanned = targets[lru] - nr[lru];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2611) nr[lru] = targets[lru] * (100 - percentage) / 100;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2612) nr[lru] -= min(nr[lru], nr_scanned);
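/*
 * Illustrative numbers only: suppose the original targets were
 * 4000 + 4000 anon and 12000 + 12000 file pages, and that when
 * nr_to_reclaim was met 4000 anon and 16000 file pages remained.
 * Anon is the smaller side, so percentage = 4000 * 100 / 8001 = 49,
 * i.e. about half of the anon target is left. Anon scanning stops,
 * and each file list is re-targeted to (100 - 49)% of its original
 * 12000 (6120) minus the 4000 it already scanned, leaving 2120, so
 * that both types end up scanned in roughly equal proportion.
 */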
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2613)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2614) scan_adjusted = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2615) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2616) blk_finish_plug(&plug);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2617) sc->nr_reclaimed += nr_reclaimed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2618)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2619) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2620) * Even if we did not try to evict anon pages at all, we want to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2621) * rebalance the anon lru active/inactive ratio.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2622) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2623) if (total_swap_pages && inactive_is_low(lruvec, LRU_INACTIVE_ANON))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2624) shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2625) sc, LRU_ACTIVE_ANON);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2626) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2627)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2628) /* Use reclaim/compaction for costly allocs or under memory pressure */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2629) static bool in_reclaim_compaction(struct scan_control *sc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2630) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2631) if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2632) (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2633) sc->priority < DEF_PRIORITY - 2))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2634) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2635)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2636) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2637) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2638)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2639) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2640) * Reclaim/compaction is used for high-order allocation requests. It reclaims
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2641) * order-0 pages before compacting the zone. should_continue_reclaim() returns
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2642) * true if more pages should be reclaimed such that when the page allocator
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2643) * calls try_to_compact_pages(), it will have enough free pages to succeed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2644) * It will give up earlier than that if there is difficulty reclaiming pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2645) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2646) static inline bool should_continue_reclaim(struct pglist_data *pgdat,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2647) unsigned long nr_reclaimed,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2648) struct scan_control *sc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2649) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2650) unsigned long pages_for_compaction;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2651) unsigned long inactive_lru_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2652) int z;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2653)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2654) /* If not in reclaim/compaction mode, stop */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2655) if (!in_reclaim_compaction(sc))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2656) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2657)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2658) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2659) * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2660) * number of pages that were scanned. This will return to the caller
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2661) * with the risk that reclaim/compaction and the resulting allocation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2662) * attempt fail. In the past we have tried harder for __GFP_RETRY_MAYFAIL
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2663) * allocations through requiring that the full LRU list has been scanned
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2664) * first, by assuming that zero delta of sc->nr_scanned means full LRU
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2665) * scan, but that approximation was wrong, and there were corner cases
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2666) * where a non-zero number of pages was always scanned.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2667) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2668) if (!nr_reclaimed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2669) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2670)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2671) /* If compaction would go ahead or the allocation would succeed, stop */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2672) for (z = 0; z <= sc->reclaim_idx; z++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2673) struct zone *zone = &pgdat->node_zones[z];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2674) if (!managed_zone(zone))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2675) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2676)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2677) switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2678) case COMPACT_SUCCESS:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2679) case COMPACT_CONTINUE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2680) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2681) default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2682) /* check next zone */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2683) ;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2684) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2685) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2686)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2687) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2688) * If we have not reclaimed enough pages for compaction and the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2689) * inactive lists are large enough, continue reclaiming
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2690) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2691) pages_for_compaction = compact_gap(sc->order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2692) inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2693) if (get_nr_swap_pages() > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2694) inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
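/*
 * At the time of writing compact_gap(order) is 2UL << order, so an
 * order-9 THP request, for example, keeps reclaiming only while the
 * inactive file pages (plus inactive anon when swap is available)
 * on this node exceed 1024 pages.
 */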
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2695)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2696) return inactive_lru_pages > pages_for_compaction;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2697) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2698)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2699) static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2700) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2701) struct mem_cgroup *target_memcg = sc->target_mem_cgroup;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2702) struct mem_cgroup *memcg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2703)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2704) memcg = mem_cgroup_iter(target_memcg, NULL, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2705) do {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2706) struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2707) unsigned long reclaimed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2708) unsigned long scanned;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2709) bool skip = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2710)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2711) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2712) * This loop can become CPU-bound when target memcgs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2713) * aren't eligible for reclaim - either because they
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2714) * don't have any reclaimable pages, or because their
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2715) * memory is explicitly protected. Avoid soft lockups.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2716) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2717) cond_resched();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2718)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2719) trace_android_vh_shrink_node_memcgs(memcg, &skip);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2720) if (skip)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2721) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2722)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2723) mem_cgroup_calculate_protection(target_memcg, memcg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2724)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2725) if (mem_cgroup_below_min(memcg)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2726) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2727) * Hard protection.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2728) * If there is no reclaimable memory, OOM.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2729) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2730) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2731) } else if (mem_cgroup_below_low(memcg)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2732) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2733) * Soft protection.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2734) * Respect the protection only as long as
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2735) * there is an unprotected supply
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2736) * of reclaimable memory from other cgroups.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2737) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2738) if (!sc->memcg_low_reclaim) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2739) sc->memcg_low_skipped = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2740) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2741) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2742) memcg_memory_event(memcg, MEMCG_LOW);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2743) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2744)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2745) reclaimed = sc->nr_reclaimed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2746) scanned = sc->nr_scanned;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2747)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2748) shrink_lruvec(lruvec, sc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2749)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2750) shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2751) sc->priority);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2752)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2753) /* Record the group's reclaim efficiency */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2754) vmpressure(sc->gfp_mask, memcg, false,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2755) sc->nr_scanned - scanned,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2756) sc->nr_reclaimed - reclaimed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2757)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2758) } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2759) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2760)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2761) static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2762) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2763) struct reclaim_state *reclaim_state = current->reclaim_state;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2764) unsigned long nr_reclaimed, nr_scanned;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2765) struct lruvec *target_lruvec;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2766) bool reclaimable = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2767) unsigned long file;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2768)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2769) target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2770)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2771) again:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2772) memset(&sc->nr, 0, sizeof(sc->nr));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2773)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2774) nr_reclaimed = sc->nr_reclaimed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2775) nr_scanned = sc->nr_scanned;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2776)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2777) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2778) * Determine the scan balance between anon and file LRUs.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2779) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2780) spin_lock_irq(&pgdat->lru_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2781) sc->anon_cost = target_lruvec->anon_cost;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2782) sc->file_cost = target_lruvec->file_cost;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2783) spin_unlock_irq(&pgdat->lru_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2784)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2785) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2786) * Target desirable inactive:active list ratios for the anon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2787) * and file LRU lists.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2788) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2789) if (!sc->force_deactivate) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2790) unsigned long refaults;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2791)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2792) refaults = lruvec_page_state(target_lruvec,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2793) WORKINGSET_ACTIVATE_ANON);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2794) if (refaults != target_lruvec->refaults[0] ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2795) inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2796) sc->may_deactivate |= DEACTIVATE_ANON;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2797) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2798) sc->may_deactivate &= ~DEACTIVATE_ANON;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2799)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2800) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2801) * When refaults are being observed, it means a new
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2802) * workingset is being established. Deactivate to get
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2803) * rid of any stale active pages quickly.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2804) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2805) refaults = lruvec_page_state(target_lruvec,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2806) WORKINGSET_ACTIVATE_FILE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2807) if (refaults != target_lruvec->refaults[1] ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2808) inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2809) sc->may_deactivate |= DEACTIVATE_FILE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2810) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2811) sc->may_deactivate &= ~DEACTIVATE_FILE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2812) } else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2813) sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2814)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2815) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2816) * If we have plenty of inactive file pages that aren't
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2817) * thrashing, try to reclaim those first before touching
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2818) * anonymous pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2819) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2820) file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2821) if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2822) sc->cache_trim_mode = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2823) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2824) sc->cache_trim_mode = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2825)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2826) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2827) * Prevent the reclaimer from falling into the cache trap: as
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2828) * cache pages start out inactive, every cache fault will tip
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2829) * the scan balance towards the file LRU. And as the file LRU
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2830) * shrinks, so does the window for rotation from references.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2831) * This means we have a runaway feedback loop where a tiny
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2832) * thrashing file LRU becomes infinitely more attractive than
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2833) * anon pages. Try to detect this based on file LRU size.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2834) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2835) if (!cgroup_reclaim(sc)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2836) unsigned long total_high_wmark = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2837) unsigned long free, anon;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2838) int z;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2839)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2840) free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2841) file = node_page_state(pgdat, NR_ACTIVE_FILE) +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2842) node_page_state(pgdat, NR_INACTIVE_FILE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2843)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2844) for (z = 0; z < MAX_NR_ZONES; z++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2845) struct zone *zone = &pgdat->node_zones[z];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2846) if (!managed_zone(zone))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2847) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2848)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2849) total_high_wmark += high_wmark_pages(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2850) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2851)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2852) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2853) * Consider anon: if that's low too, this isn't a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2854) * runaway file reclaim problem, but rather just
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2855) * extreme pressure. Reclaim as per usual then.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2856) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2857) anon = node_page_state(pgdat, NR_INACTIVE_ANON);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2858)
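/*
 * In other words: only flag file_is_tiny for global reclaim when the
 * remaining page cache plus free memory could not even refill the
 * high watermarks, anon deactivation is not already warranted, and
 * there is a meaningful amount of inactive anon to scan at the
 * current priority.
 */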
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2859) sc->file_is_tiny =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2860) file + free <= total_high_wmark &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2861) !(sc->may_deactivate & DEACTIVATE_ANON) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2862) anon >> sc->priority;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2863) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2864)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2865) shrink_node_memcgs(pgdat, sc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2866)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2867) if (reclaim_state) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2868) sc->nr_reclaimed += reclaim_state->reclaimed_slab;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2869) reclaim_state->reclaimed_slab = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2870) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2871)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2872) /* Record the subtree's reclaim efficiency */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2873) vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2874) sc->nr_scanned - nr_scanned,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2875) sc->nr_reclaimed - nr_reclaimed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2876)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2877) if (sc->nr_reclaimed - nr_reclaimed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2878) reclaimable = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2879)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2880) if (current_is_kswapd()) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2881) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2882) * If reclaim is isolating dirty pages under writeback,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2883) * it implies that the long-lived page allocation rate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2884) * is exceeding the page laundering rate. Either the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2885) * global limits are not being effective at throttling
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2886) * processes due to the page distribution throughout
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2887) * zones or there is heavy usage of a slow backing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2888) * device. The only option is to throttle from reclaim
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2889) * context which is not ideal as there is no guarantee
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2890) * the dirtying process is throttled in the same way
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2891) * balance_dirty_pages() manages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2892) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2893) * Once a node is flagged PGDAT_WRITEBACK, kswapd will
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2894) * count the number of pages under writeback that are flagged for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2895) * immediate reclaim and stall if any are encountered
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2896) * in the nr_immediate check below.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2897) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2898) if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2899) set_bit(PGDAT_WRITEBACK, &pgdat->flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2900)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2901) /* Allow kswapd to start writing pages during reclaim. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2902) if (sc->nr.unqueued_dirty == sc->nr.file_taken)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2903) set_bit(PGDAT_DIRTY, &pgdat->flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2904)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2905) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2906) * If kswapd scans pages marked for immediate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2907) * reclaim and under writeback (nr_immediate), it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2908) * implies that pages are cycling through the LRU
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2909) * faster than they are written so also forcibly stall.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2910) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2911) if (sc->nr.immediate)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2912) congestion_wait(BLK_RW_ASYNC, HZ/10);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2913) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2914)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2915) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2916) * Tag a node/memcg as congested if all the dirty pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2917) * scanned were backed by a congested BDI and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2918) * wait_iff_congested will stall.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2919) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2920) * Legacy memcg will stall in page writeback so avoid forcibly
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2921) * stalling in wait_iff_congested().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2922) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2923) if ((current_is_kswapd() ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2924) (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2925) sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2926) set_bit(LRUVEC_CONGESTED, &target_lruvec->flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2927)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2928) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2929) * Stall direct reclaim for IO completions if the underlying BDIs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2930) * and the node are congested. Allow kswapd to continue until it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2931) * starts encountering unqueued dirty pages or cycling through
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2932) * the LRU too quickly.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2933) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2934) if (!current_is_kswapd() && current_may_throttle() &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2935) !sc->hibernation_mode &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2936) test_bit(LRUVEC_CONGESTED, &target_lruvec->flags))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2937) wait_iff_congested(BLK_RW_ASYNC, HZ/10);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2938)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2939) if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2940) sc))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2941) goto again;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2942)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2943) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2944) * Kswapd gives up on balancing particular nodes after too
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2945) * many failures to reclaim anything from them and goes to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2946) * sleep. On reclaim progress, reset the failure counter. A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2947) * successful direct reclaim run will revive a dormant kswapd.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2948) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2949) if (reclaimable)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2950) pgdat->kswapd_failures = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2951) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2952)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2953) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2954) * Returns true if compaction should go ahead for a costly-order request, or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2955) * the allocation would already succeed without compaction. Return false if we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2956) * should reclaim first.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2957) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2958) static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2959) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2960) unsigned long watermark;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2961) enum compact_result suitable;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2962)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2963) suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2964) if (suitable == COMPACT_SUCCESS)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2965) /* Allocation should succeed already. Don't reclaim. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2966) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2967) if (suitable == COMPACT_SKIPPED)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2968) /* Compaction cannot yet proceed. Do reclaim. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2969) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2970)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2971) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2972) * Compaction is already possible, but it takes time to run and there
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2973) * are potentially other callers using the pages just freed. So proceed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2974) * with reclaim to make a buffer of free pages available to give
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2975) * compaction a reasonable chance of completing and allocating the page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2976) * Note that we won't actually reclaim the whole buffer in one attempt
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2977) * as the target watermark in should_continue_reclaim() is lower. But if
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2978) * we are already above the high+gap watermark, don't reclaim at all.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2979) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2980) watermark = high_wmark_pages(zone) + compact_gap(sc->order);
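/*
 * For example, with an order-9 request and a zone whose high
 * watermark is 8192 pages, this checks for 8192 + 1024 = 9216 free
 * pages (again assuming compact_gap(order) == 2UL << order).
 */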
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2981)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2982) return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2983) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2984)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2985) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2986) * This is the direct reclaim path, for page-allocating processes. We only
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2987) * try to reclaim pages from zones which will satisfy the caller's allocation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2988) * request.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2989) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2990) * If a zone is deemed to be full of pinned pages then just give it a light
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2991) * scan and then give up on it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2992) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2993) static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2994) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2995) struct zoneref *z;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2996) struct zone *zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2997) unsigned long nr_soft_reclaimed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2998) unsigned long nr_soft_scanned;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2999) gfp_t orig_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3000) pg_data_t *last_pgdat = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3001)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3002) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3003) * If the number of buffer_heads in the machine exceeds the maximum
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3004) * allowed level, force direct reclaim to scan the highmem zone as
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3005) * highmem pages could be pinning lowmem pages storing buffer_heads
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3006) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3007) orig_mask = sc->gfp_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3008) if (buffer_heads_over_limit) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3009) sc->gfp_mask |= __GFP_HIGHMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3010) sc->reclaim_idx = gfp_zone(sc->gfp_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3011) }
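/*
 * On CONFIG_HIGHMEM systems this widens an otherwise lowmem-only
 * request: gfp_zone() of a mask containing __GFP_HIGHMEM selects
 * ZONE_HIGHMEM, so reclaim_idx now covers the highmem zone too.
 */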
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3012)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3013) for_each_zone_zonelist_nodemask(zone, z, zonelist,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3014) sc->reclaim_idx, sc->nodemask) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3015) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3016) * Take care that memory controller reclaim has only a small
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3017) * influence on the global LRU.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3018) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3019) if (!cgroup_reclaim(sc)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3020) if (!cpuset_zone_allowed(zone,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3021) GFP_KERNEL | __GFP_HARDWALL))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3022) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3023)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3024) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3025) * If we already have plenty of memory free for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3026) * compaction in this zone, don't free any more.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3027) * Even though compaction is invoked for any
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3028) * non-zero order, only frequent costly order
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3029) * reclamation is disruptive enough to become a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3030) * noticeable problem, like transparent huge
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3031) * page allocations.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3032) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3033) if (IS_ENABLED(CONFIG_COMPACTION) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3034) sc->order > PAGE_ALLOC_COSTLY_ORDER &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3035) compaction_ready(zone, sc)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3036) sc->compaction_ready = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3037) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3038) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3039)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3040) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3041) * Shrink each node in the zonelist once. If the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3042) * zonelist is ordered by zone (not the default) then a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3043) * node may be shrunk multiple times but in that case
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3044) * the user prefers lower zones being preserved.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3045) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3046) if (zone->zone_pgdat == last_pgdat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3047) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3048)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3049) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3050) * This steals pages from memory cgroups over softlimit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3051) * and returns the number of reclaimed pages and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3052) * scanned pages. This works for global memory pressure
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3053) * and balancing, not for a memcg's limit.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3054) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3055) nr_soft_scanned = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3056) nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3057) sc->order, sc->gfp_mask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3058) &nr_soft_scanned);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3059) sc->nr_reclaimed += nr_soft_reclaimed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3060) sc->nr_scanned += nr_soft_scanned;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3061) /* need some check to avoid more shrink_node() calls */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3062) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3063)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3064) /* See comment about same check for global reclaim above */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3065) if (zone->zone_pgdat == last_pgdat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3066) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3067) last_pgdat = zone->zone_pgdat;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3068) shrink_node(zone->zone_pgdat, sc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3069) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3070)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3071) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3072) * Restore to original mask to avoid the impact on the caller if we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3073) * promoted it to __GFP_HIGHMEM.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3074) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3075) sc->gfp_mask = orig_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3076) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3077)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3078) static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3079) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3080) struct lruvec *target_lruvec;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3081) unsigned long refaults;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3082)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3083) target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3084) refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3085) target_lruvec->refaults[0] = refaults;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3086) refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3087) target_lruvec->refaults[1] = refaults;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3088) trace_android_vh_snapshot_refaults(target_lruvec);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3089) }
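/*
 * Illustrative note (sketch only, paraphrased from this kernel generation's
 * shrink_node(), not code in this function): the refaults[] baseline saved
 * above is consumed on the next reclaim pass, roughly as
 *
 *	refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE);
 *	if (refaults != target_lruvec->refaults[1] ||
 *	    inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
 *		sc->may_deactivate |= DEACTIVATE_FILE;
 *
 * i.e. refault activity since the last snapshot is taken as a sign that a
 * new workingset is being established, so stale active pages may be
 * deactivated more aggressively.
 */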
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3090)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3091) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3092) * This is the main entry point to direct page reclaim.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3093) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3094) * If a full scan of the inactive list fails to free enough memory then we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3095) * are "out of memory" and something needs to be killed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3096) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3097) * If the caller is !__GFP_FS then the probability of a failure is reasonably
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3098) * high - the zone may be full of dirty or under-writeback pages, which this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3099) * caller can't do much about. We kick the writeback threads and take explicit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3100) * naps in the hope that some of these pages can be written. But if the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3101) * allocating task holds filesystem locks which prevent writeout this might not
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3102) * work, and the allocation attempt will fail.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3103) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3104) * returns: 0, if no pages reclaimed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3105) * else, the number of pages reclaimed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3106) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3107) static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3108) struct scan_control *sc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3109) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3110) int initial_priority = sc->priority;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3111) pg_data_t *last_pgdat;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3112) struct zoneref *z;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3113) struct zone *zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3114) retry:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3115) delayacct_freepages_start();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3116)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3117) if (!cgroup_reclaim(sc))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3118) __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3119)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3120) do {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3121) vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3122) sc->priority);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3123) sc->nr_scanned = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3124) shrink_zones(zonelist, sc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3125)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3126) if (sc->nr_reclaimed >= sc->nr_to_reclaim)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3127) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3128)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3129) if (sc->compaction_ready)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3130) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3131)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3132) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3133) * If we're having trouble reclaiming, start doing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3134) * writepage even in laptop mode.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3135) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3136) if (sc->priority < DEF_PRIORITY - 2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3137) sc->may_writepage = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3138) } while (--sc->priority >= 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3139)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3140) last_pgdat = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3141) for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3142) sc->nodemask) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3143) if (zone->zone_pgdat == last_pgdat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3144) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3145) last_pgdat = zone->zone_pgdat;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3146)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3147) snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3148)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3149) if (cgroup_reclaim(sc)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3150) struct lruvec *lruvec;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3151)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3152) lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3153) zone->zone_pgdat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3154) clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3155) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3156) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3157)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3158) delayacct_freepages_end();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3159)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3160) if (sc->nr_reclaimed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3161) return sc->nr_reclaimed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3162)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3163) /* Aborted reclaim to try compaction? don't OOM, then */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3164) if (sc->compaction_ready)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3165) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3166)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3167) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3168) * We make inactive:active ratio decisions based on the node's
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3169) * composition of memory, but a restrictive reclaim_idx or a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3170) * memory.low cgroup setting can exempt large amounts of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3171) * memory from reclaim. Neither of which are very common, so
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3172) * instead of doing costly eligibility calculations of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3173) * entire cgroup subtree up front, we assume the estimates are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3174) * good, and retry with forcible deactivation if that fails.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3175) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3176) if (sc->skipped_deactivate) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3177) sc->priority = initial_priority;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3178) sc->force_deactivate = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3179) sc->skipped_deactivate = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3180) goto retry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3181) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3182)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3183) /* Untapped cgroup reserves? Don't OOM, retry. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3184) if (sc->memcg_low_skipped) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3185) sc->priority = initial_priority;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3186) sc->force_deactivate = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3187) sc->memcg_low_reclaim = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3188) sc->memcg_low_skipped = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3189) goto retry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3190) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3191)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3192) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3193) }
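/*
 * Rough sketch of what the priority loop above amounts to (the shift is
 * performed in get_scan_count(), not here; DEF_PRIORITY is assumed to be
 * 12 as in mainline):
 *
 *	nr_to_scan = lruvec_lru_size(lruvec, lru, sc->reclaim_idx)
 *			>> sc->priority;
 *
 * so the first pass looks at roughly 1/4096 of each LRU list and every
 * further pass doubles the window, until sc->nr_to_reclaim pages have been
 * reclaimed, compaction is ready, or priority reaches zero.
 */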
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3194)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3195) static bool allow_direct_reclaim(pg_data_t *pgdat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3196) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3197) struct zone *zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3198) unsigned long pfmemalloc_reserve = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3199) unsigned long free_pages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3200) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3201) bool wmark_ok;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3202)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3203) if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3204) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3205)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3206) for (i = 0; i <= ZONE_NORMAL; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3207) zone = &pgdat->node_zones[i];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3208) if (!managed_zone(zone))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3209) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3210)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3211) if (!zone_reclaimable_pages(zone))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3212) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3213)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3214) pfmemalloc_reserve += min_wmark_pages(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3215) free_pages += zone_page_state(zone, NR_FREE_PAGES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3216) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3217)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3218) /* If there are no reserves (unexpected config) then do not throttle */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3219) if (!pfmemalloc_reserve)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3220) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3221)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3222) wmark_ok = free_pages > pfmemalloc_reserve / 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3223)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3224) /* kswapd must be awake if processes are being throttled */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3225) if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3226) if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3227) WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3228)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3229) wake_up_interruptible(&pgdat->kswapd_wait);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3230) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3231)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3232) return wmark_ok;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3233) }
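/*
 * Worked example for the 50% threshold above (numbers purely illustrative):
 * if ZONE_DMA32 and ZONE_NORMAL on this node have min watermarks of 16384
 * and 49152 pages, pfmemalloc_reserve sums to 65536, so direct reclaim is
 * allowed only while the combined NR_FREE_PAGES of those zones stays above
 * 65536 / 2 = 32768 pages.  Below that, callers are throttled in
 * throttle_direct_reclaim() and kswapd is woken to refill the reserve.
 */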
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3234)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3235) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3236) * Throttle direct reclaimers if backing storage is backed by the network
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3237) * and the PFMEMALLOC reserve for the preferred node is getting dangerously
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3238) * depleted. kswapd will continue to make progress and wake the processes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3239) * when the low watermark is reached.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3240) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3241) * Returns true if a fatal signal was delivered during throttling. If this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3242) * happens, the page allocator should not consider triggering the OOM killer.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3243) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3244) static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3245) nodemask_t *nodemask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3246) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3247) struct zoneref *z;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3248) struct zone *zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3249) pg_data_t *pgdat = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3250)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3251) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3252) * Kernel threads should not be throttled as they may be indirectly
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3253) * responsible for cleaning pages necessary for reclaim to make forward
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3254) * progress. kjournald for example may enter direct reclaim while
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3255) * committing a transaction; throttling it could force other
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3256) * processes to block on log_wait_commit().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3257) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3258) if (current->flags & PF_KTHREAD)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3259) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3260)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3261) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3262) * If a fatal signal is pending, this process should not throttle.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3263) * It should return quickly so it can exit and free its memory
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3264) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3265) if (fatal_signal_pending(current))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3266) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3267)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3268) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3269) * Check if the pfmemalloc reserves are ok by finding the first node
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3270) * with a usable ZONE_NORMAL or lower zone. The expectation is that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3271) * GFP_KERNEL will be required for allocating network buffers when
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3272) * swapping over the network so ZONE_HIGHMEM is unusable.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3273) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3274) * Throttling is based on the first usable node and throttled processes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3275) * wait on a queue until kswapd makes progress and wakes them. There
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3276) * is then an affinity between processes waking up and where reclaim
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3277) * progress has been made, assuming the process wakes on the same node.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3278) * More importantly, processes running on remote nodes will not compete
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3279) * for remote pfmemalloc reserves and processes on different nodes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3280) * should make reasonable progress.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3281) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3282) for_each_zone_zonelist_nodemask(zone, z, zonelist,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3283) gfp_zone(gfp_mask), nodemask) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3284) if (zone_idx(zone) > ZONE_NORMAL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3285) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3286)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3287) /* Throttle based on the first usable node */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3288) pgdat = zone->zone_pgdat;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3289) if (allow_direct_reclaim(pgdat))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3290) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3291) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3292) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3293)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3294) /* If no zone was usable by the allocation flags then do not throttle */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3295) if (!pgdat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3296) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3297)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3298) /* Account for the throttling */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3299) count_vm_event(PGSCAN_DIRECT_THROTTLE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3300)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3301) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3302) * If the caller cannot enter the filesystem, it's possible that it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3303) * is due to the caller holding an FS lock or performing a journal
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3304) * transaction in the case of a filesystem like ext[3|4]. In this case,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3305) * it is not safe to block on pfmemalloc_wait as kswapd could be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3306) * blocked waiting on the same lock. Instead, throttle for up to a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3307) * second before continuing.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3308) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3309) if (!(gfp_mask & __GFP_FS)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3310) wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3311) allow_direct_reclaim(pgdat), HZ);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3312)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3313) goto check_pending;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3314) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3315)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3316) /* Throttle until kswapd wakes the process */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3317) wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3318) allow_direct_reclaim(pgdat));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3319)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3320) check_pending:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3321) if (fatal_signal_pending(current))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3322) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3323)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3324) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3325) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3326) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3327)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3328) unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3329) gfp_t gfp_mask, nodemask_t *nodemask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3330) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3331) unsigned long nr_reclaimed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3332) struct scan_control sc = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3333) .nr_to_reclaim = SWAP_CLUSTER_MAX,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3334) .gfp_mask = current_gfp_context(gfp_mask),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3335) .reclaim_idx = gfp_zone(gfp_mask),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3336) .order = order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3337) .nodemask = nodemask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3338) .priority = DEF_PRIORITY,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3339) .may_writepage = !laptop_mode,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3340) .may_unmap = 1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3341) .may_swap = 1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3342) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3343)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3344) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3345) * scan_control uses s8 fields for order, priority, and reclaim_idx.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3346) * Confirm they are large enough for max values.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3347) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3348) BUILD_BUG_ON(MAX_ORDER > S8_MAX);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3349) BUILD_BUG_ON(DEF_PRIORITY > S8_MAX);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3350) BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3351)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3352) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3353) * Do not enter reclaim if fatal signal was delivered while throttled.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3354) * 1 is returned so that the page allocator does not OOM kill at this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3355) * point.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3356) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3357) if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3358) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3359)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3360) set_task_reclaim_state(current, &sc.reclaim_state);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3361) trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3362)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3363) nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3364)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3365) trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3366) set_task_reclaim_state(current, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3367)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3368) return nr_reclaimed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3369) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3370)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3371) #ifdef CONFIG_MEMCG
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3372)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3373) /* Only used by soft limit reclaim. Do not reuse for anything else. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3374) unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3375) gfp_t gfp_mask, bool noswap,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3376) pg_data_t *pgdat,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3377) unsigned long *nr_scanned)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3378) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3379) struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3380) struct scan_control sc = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3381) .nr_to_reclaim = SWAP_CLUSTER_MAX,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3382) .target_mem_cgroup = memcg,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3383) .may_writepage = !laptop_mode,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3384) .may_unmap = 1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3385) .reclaim_idx = MAX_NR_ZONES - 1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3386) .may_swap = !noswap,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3387) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3388)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3389) WARN_ON_ONCE(!current->reclaim_state);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3390)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3391) sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3392) (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3393)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3394) trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3395) sc.gfp_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3396)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3397) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3398) * NOTE: Although we can get the priority field, using it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3399) * here is not a good idea, since it limits the pages we can scan.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3400) * If we don't reclaim here, the shrink_node() call from balance_pgdat()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3401) * will pick up pages from other mem cgroups as well. We hack
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3402) * the priority and make it zero.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3403) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3404) shrink_lruvec(lruvec, &sc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3405)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3406) trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3407)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3408) *nr_scanned = sc.nr_scanned;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3409)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3410) return sc.nr_reclaimed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3411) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3412)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3413) unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3414) unsigned long nr_pages,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3415) gfp_t gfp_mask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3416) bool may_swap)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3417) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3418) unsigned long nr_reclaimed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3419) unsigned int noreclaim_flag;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3420) struct scan_control sc = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3421) .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3422) .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3423) (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3424) .reclaim_idx = MAX_NR_ZONES - 1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3425) .target_mem_cgroup = memcg,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3426) .priority = DEF_PRIORITY,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3427) .may_writepage = !laptop_mode,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3428) .may_unmap = 1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3429) .may_swap = may_swap,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3430) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3431) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3432) * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3433) * equal pressure on all the nodes. This is based on the assumption that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3434) * the reclaim does not bail out early.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3435) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3436) struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3437)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3438) set_task_reclaim_state(current, &sc.reclaim_state);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3439) trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3440) noreclaim_flag = memalloc_noreclaim_save();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3441)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3442) nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3443)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3444) memalloc_noreclaim_restore(noreclaim_flag);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3445) trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3446) set_task_reclaim_state(current, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3447)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3448) return nr_reclaimed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3449) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3450) EXPORT_SYMBOL_GPL(try_to_free_mem_cgroup_pages);
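/*
 * Usage sketch (the real call sites live in mm/memcontrol.c and are not
 * shown here; the arguments below are illustrative): memcg limit
 * enforcement is expected to invoke this roughly as
 *
 *	try_to_free_mem_cgroup_pages(memcg, nr_pages, GFP_KERNEL, true);
 *
 * to reclaim at least nr_pages from the memcg subtree with swap allowed,
 * while memalloc_noreclaim_save() above keeps the reclaim work itself from
 * recursing into direct reclaim.
 */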
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3451) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3452)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3453) static void age_active_anon(struct pglist_data *pgdat,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3454) struct scan_control *sc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3455) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3456) struct mem_cgroup *memcg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3457) struct lruvec *lruvec;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3458)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3459) if (!total_swap_pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3460) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3461)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3462) lruvec = mem_cgroup_lruvec(NULL, pgdat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3463) if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3464) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3465)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3466) memcg = mem_cgroup_iter(NULL, NULL, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3467) do {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3468) lruvec = mem_cgroup_lruvec(memcg, pgdat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3469) shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3470) sc, LRU_ACTIVE_ANON);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3471) memcg = mem_cgroup_iter(NULL, memcg, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3472) } while (memcg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3473) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3474)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3475) static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3476) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3477) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3478) struct zone *zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3479)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3480) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3481) * Check for watermark boosts top-down as the higher zones
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3482) * are more likely to be boosted. Both watermarks and boosts
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3483) * should not be checked at the same time as reclaim would
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3484) * start prematurely when there is no boosting and a lower
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3485) * zone is balanced.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3486) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3487) for (i = highest_zoneidx; i >= 0; i--) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3488) zone = pgdat->node_zones + i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3489) if (!managed_zone(zone))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3490) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3491)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3492) if (zone->watermark_boost)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3493) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3494) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3495)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3496) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3497) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3498)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3499) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3500) * Returns true if there is an eligible zone balanced for the requested order
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3501) * and highest_zoneidx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3502) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3503) static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3504) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3505) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3506) unsigned long mark = -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3507) struct zone *zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3508)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3509) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3510) * Check watermarks bottom-up as lower zones are more likely to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3511) * meet watermarks.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3512) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3513) for (i = 0; i <= highest_zoneidx; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3514) zone = pgdat->node_zones + i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3515)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3516) if (!managed_zone(zone))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3517) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3518)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3519) mark = high_wmark_pages(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3520) if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3521) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3522) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3523)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3524) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3525) * If a node has no populated zone within highest_zoneidx, it does not
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3526) * need balancing by definition. This can happen if a zone-restricted
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3527) * allocation tries to wake a remote kswapd.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3528) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3529) if (mark == -1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3530) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3531)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3532) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3533) }
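/*
 * Illustrative call (assumed, for clarity): kswapd deciding whether an
 * order-3 wakeup restricted to lowmem is already satisfied would check
 *
 *	pgdat_balanced(pgdat, 3, ZONE_NORMAL);
 *
 * which returns true as soon as any managed zone up to ZONE_NORMAL passes
 * zone_watermark_ok_safe() against its high watermark at order 3.
 */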
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3534)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3535) /* Clear pgdat state for congested, dirty or under writeback. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3536) static void clear_pgdat_congested(pg_data_t *pgdat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3537) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3538) struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3539)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3540) clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3541) clear_bit(PGDAT_DIRTY, &pgdat->flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3542) clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3543) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3544)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3545) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3546) * Prepare kswapd for sleeping. This verifies that there are no processes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3547) * waiting in throttle_direct_reclaim() and that watermarks have been met.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3548) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3549) * Returns true if kswapd is ready to sleep
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3550) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3551) static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3552) int highest_zoneidx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3553) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3554) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3555) * The throttled processes are normally woken up in balance_pgdat() as
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3556) * soon as allow_direct_reclaim() is true. But there is a potential
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3557) * race between when kswapd checks the watermarks and a process gets
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3558) * throttled. There is also a potential race if processes get
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3559) * throttled, kswapd wakes, a large process exits thereby balancing the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3560) * zones, which causes kswapd to exit balance_pgdat() before reaching
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3561) * the wake up checks. If kswapd is going to sleep, no process should
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3562) * be sleeping on pfmemalloc_wait, so wake them now if necessary. If
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3563) * the wake up is premature, processes will wake kswapd and get
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3564) * throttled again. The difference from wake ups in balance_pgdat() is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3565) * that here we are under prepare_to_wait().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3566) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3567) if (waitqueue_active(&pgdat->pfmemalloc_wait))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3568) wake_up_all(&pgdat->pfmemalloc_wait);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3569)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3570) /* Hopeless node, leave it to direct reclaim */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3571) if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3572) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3573)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3574) if (pgdat_balanced(pgdat, order, highest_zoneidx)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3575) clear_pgdat_congested(pgdat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3576) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3577) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3578)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3579) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3580) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3581)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3582) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3583) * kswapd shrinks a node of pages that are at or below the highest usable
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3584) * zone that is currently unbalanced.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3585) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3586) * Returns true if kswapd scanned at least the requested number of pages to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3587) * reclaim or if the lack of progress was due to pages under writeback.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3588) * This is used to determine if the scanning priority needs to be raised.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3589) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3590) static bool kswapd_shrink_node(pg_data_t *pgdat,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3591) struct scan_control *sc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3592) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3593) struct zone *zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3594) int z;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3595)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3596) /* Reclaim a number of pages proportional to the number of zones */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3597) sc->nr_to_reclaim = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3598) for (z = 0; z <= sc->reclaim_idx; z++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3599) zone = pgdat->node_zones + z;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3600) if (!managed_zone(zone))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3601) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3602)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3603) sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3604) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3605)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3606) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3607) * Historically care was taken to put equal pressure on all zones but
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3608) * now pressure is applied based on node LRU order.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3609) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3610) shrink_node(pgdat, sc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3611)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3612) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3613) * Fragmentation may mean that the system cannot be rebalanced for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3614) * high-order allocations. If twice the allocation size has been
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3615) * reclaimed then recheck watermarks only at order-0 to prevent
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3616) * excessive reclaim. Assume that a process that requested a high-order
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3617) * allocation can direct reclaim/compact.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3618) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3619) if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3620) sc->order = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3621)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3622) return sc->nr_scanned >= sc->nr_to_reclaim;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3623) }
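/*
 * Sketch of the order-0 fallback above (sizes illustrative, assuming 4KB
 * pages and compact_gap(order) == 2UL << order as in mm/internal.h): for
 * an order-9 wakeup, compact_gap(9) is 1024 pages, so once kswapd has
 * reclaimed about 4MB it stops chasing the high-order watermark, rechecks
 * only order-0, and leaves defragmentation to kcompactd or direct
 * compaction.
 */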
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3624)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3625) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3626) * For kswapd, balance_pgdat() will reclaim pages across a node from zones
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3627) * that are eligible for use by the caller until at least one zone is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3628) * balanced.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3629) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3630) * Returns the order kswapd finished reclaiming at.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3631) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3632) * kswapd scans the zones in the highmem->normal->dma direction. It skips
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3633) * zones which have free_pages > high_wmark_pages(zone), but once a zone is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3634) * found to have free_pages <= high_wmark_pages(zone), any page in that zone
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3635) * or lower is eligible for reclaim until at least one usable zone is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3636) * balanced.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3637) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3638) static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3639) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3640) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3641) unsigned long nr_soft_reclaimed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3642) unsigned long nr_soft_scanned;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3643) unsigned long pflags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3644) unsigned long nr_boost_reclaim;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3645) unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3646) bool boosted;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3647) struct zone *zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3648) struct scan_control sc = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3649) .gfp_mask = GFP_KERNEL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3650) .order = order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3651) .may_unmap = 1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3652) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3653)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3654) set_task_reclaim_state(current, &sc.reclaim_state);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3655) psi_memstall_enter(&pflags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3656) __fs_reclaim_acquire();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3657)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3658) count_vm_event(PAGEOUTRUN);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3659)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3660) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3661) * Account for the reclaim boost. Note that the zone boost is left in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3662) * place so that parallel allocations that are near the watermark will
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3663) * stall or enter direct reclaim until kswapd is finished.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3664) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3665) nr_boost_reclaim = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3666) for (i = 0; i <= highest_zoneidx; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3667) zone = pgdat->node_zones + i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3668) if (!managed_zone(zone))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3669) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3670)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3671) nr_boost_reclaim += zone->watermark_boost;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3672) zone_boosts[i] = zone->watermark_boost;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3673) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3674) boosted = nr_boost_reclaim;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3675)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3676) restart:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3677) sc.priority = DEF_PRIORITY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3678) do {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3679) unsigned long nr_reclaimed = sc.nr_reclaimed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3680) bool raise_priority = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3681) bool balanced;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3682) bool ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3683)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3684) sc.reclaim_idx = highest_zoneidx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3685)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3686) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3687) * If the number of buffer_heads exceeds the maximum allowed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3688) * then consider reclaiming from all zones. This has a dual
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3689) * purpose -- on 64-bit systems it is expected that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3690) * buffer_heads are stripped during active rotation. On 32-bit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3691) * systems, highmem pages can pin lowmem memory and shrinking
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3692) * buffers can relieve lowmem pressure. Reclaim may still not
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3693) * go ahead if all eligible zones for the original allocation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3694) * request are balanced to avoid excessive reclaim from kswapd.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3695) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3696) if (buffer_heads_over_limit) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3697) for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3698) zone = pgdat->node_zones + i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3699) if (!managed_zone(zone))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3700) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3701)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3702) sc.reclaim_idx = i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3703) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3704) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3705) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3706)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3707) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3708) * If the pgdat is imbalanced then ignore boosting and preserve
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3709) * the watermarks for a later time and restart. Note that the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3710) * zone watermarks will still be reset at the end of balancing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3711) * on the grounds that the normal reclaim should be enough to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3712) * re-evaluate whether boosting is required when kswapd next wakes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3713) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3714) balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3715) if (!balanced && nr_boost_reclaim) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3716) nr_boost_reclaim = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3717) goto restart;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3718) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3719)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3720) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3721) * If boosting is not active then only reclaim if there are no
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3722) * eligible zones. Note that sc.reclaim_idx is not used as
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3723) * buffer_heads_over_limit may have adjusted it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3724) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3725) if (!nr_boost_reclaim && balanced)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3726) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3727)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3728) /* Limit the priority of boosting to avoid reclaim writeback */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3729) if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3730) raise_priority = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3731)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3732) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3733) * Do not writeback or swap pages for boosted reclaim. The
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3734) * intent is to relieve pressure not issue sub-optimal IO
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3735) * from reclaim context. If no pages are reclaimed, the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3736) * reclaim will be aborted.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3737) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3738) sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3739) sc.may_swap = !nr_boost_reclaim;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3740)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3741) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3742) * Do some background aging of the anon list, to give
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3743) * pages a chance to be referenced before reclaiming. All
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3744) * pages are rotated regardless of classzone as this is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3745) * about consistent aging.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3746) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3747) age_active_anon(pgdat, &sc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3748)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3749) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3750) * If we're having trouble reclaiming, start doing writepage
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3751) * even in laptop mode.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3752) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3753) if (sc.priority < DEF_PRIORITY - 2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3754) sc.may_writepage = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3755)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3756) /* Call soft limit reclaim before calling shrink_node. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3757) sc.nr_scanned = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3758) nr_soft_scanned = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3759) nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3760) sc.gfp_mask, &nr_soft_scanned);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3761) sc.nr_reclaimed += nr_soft_reclaimed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3762)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3763) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3764) * There should be no need to raise the scanning priority if
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3765) * enough pages are already being scanned that the high
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3766) * watermark would be met at 100% efficiency.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3767) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3768) if (kswapd_shrink_node(pgdat, &sc))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3769) raise_priority = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3770)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3771) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3772) * If the low watermark is met there is no need for processes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3773) * to be throttled on pfmemalloc_wait as they should now be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3774) * able to safely make forward progress. Wake them up.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3775) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3776) if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3777) allow_direct_reclaim(pgdat))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3778) wake_up_all(&pgdat->pfmemalloc_wait);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3779)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3780) /* Check if kswapd should be suspending */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3781) __fs_reclaim_release();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3782) ret = try_to_freeze();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3783) __fs_reclaim_acquire();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3784) if (ret || kthread_should_stop())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3785) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3786)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3787) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3788) * Raise priority if scanning rate is too low or there was no
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3789) * progress in reclaiming pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3790) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3791) nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3792) nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3793)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3794) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3795) * If reclaim made no progress for a boost, stop reclaim as
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3796) * IO cannot be queued and it could be an infinite loop in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3797) * extreme circumstances.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3798) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3799) if (nr_boost_reclaim && !nr_reclaimed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3800) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3801)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3802) if (raise_priority || !nr_reclaimed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3803) sc.priority--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3804) } while (sc.priority >= 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3805)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3806) if (!sc.nr_reclaimed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3807) pgdat->kswapd_failures++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3808)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3809) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3810) /* If reclaim was boosted, account for the reclaim done in this pass */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3811) if (boosted) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3812) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3813)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3814) for (i = 0; i <= highest_zoneidx; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3815) if (!zone_boosts[i])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3816) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3817)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3818) /* Increments are under the zone lock */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3819) zone = pgdat->node_zones + i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3820) spin_lock_irqsave(&zone->lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3821) zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3822) spin_unlock_irqrestore(&zone->lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3823) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3824)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3825) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3826) * As there is now likely space, wake up kcompactd to defragment
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3827) * the pageblocks.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3828) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3829) wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3830) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3831)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3832) snapshot_refaults(NULL, pgdat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3833) __fs_reclaim_release();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3834) psi_memstall_leave(&pflags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3835) set_task_reclaim_state(current, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3836)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3837) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3838) * Return the order kswapd stopped reclaiming at as
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3839) * prepare_kswapd_sleep() takes it into account. If another caller
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3840) * entered the allocator slow path while kswapd was awake, order will
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3841) * remain at the higher level.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3842) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3843) return sc.order;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3844) }
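/*
 * Condensed view of balance_pgdat() above (paraphrase only, no new
 * behaviour):
 *
 *	sc.priority = DEF_PRIORITY;
 *	do {
 *		if (balanced and no boost work pending)
 *			break;
 *		age_active_anon(); soft limit reclaim; kswapd_shrink_node();
 *		wake pfmemalloc waiters once allow_direct_reclaim() is true;
 *		if (raise_priority || no progress)
 *			sc.priority--;
 *	} while (sc.priority >= 1);
 *
 * kswapd_failures is only incremented when a full descent to priority 1
 * reclaimed nothing; allow_direct_reclaim() and prepare_kswapd_sleep()
 * treat MAX_RECLAIM_RETRIES such failures as a hopeless node and stop
 * relying on kswapd for it.
 */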
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3845)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3846) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3847) * The pgdat->kswapd_highest_zoneidx is used to pass the highest zone index to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3848) * be reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES, which
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3849) * is not a valid index, then either kswapd is running for the first time or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3850) * it couldn't sleep after the previous reclaim attempt (the node is still
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3851) * unbalanced). In that case return the zone index of the previous reclaim cycle.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3852) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3853) static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3854) enum zone_type prev_highest_zoneidx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3855) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3856) enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3857)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3858) return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3859) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3860)
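/*
 * Wait on pgdat->kswapd_wait until the node needs balancing again. A short
 * timed sleep is tried first; only if the node still looks balanced after it
 * does kswapd sleep fully until explicitly woken by wakeup_kswapd().
 */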
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3861) static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3862) unsigned int highest_zoneidx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3863) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3864) long remaining = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3865) DEFINE_WAIT(wait);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3866)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3867) if (freezing(current) || kthread_should_stop())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3868) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3869)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3870) prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3871)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3872) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3873) * Try to sleep for a short interval. Note that kcompactd will only be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3874) * woken if it is possible to sleep for a short interval. This is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3875) * deliberate on the assumption that if reclaim cannot keep an
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3876) * eligible zone balanced then it is also unlikely that compaction
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3877) * will succeed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3878) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3879) if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3880) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3881) * Compaction records which pageblocks it recently failed to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3882) * isolate pages from and skips them in future scans.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3883) * When kswapd is going to sleep, it is reasonable to assume
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3884) * that isolation and compaction may now succeed, so reset the cache.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3885) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3886) reset_isolation_suitable(pgdat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3887)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3888) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3889) * We have freed the memory, now we should compact it to make
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3890) * allocation of the requested order possible.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3891) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3892) wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3893)
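/* Nap for up to HZ/10 jiffies; a wakeup request ends the nap early. */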
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3894) remaining = schedule_timeout(HZ/10);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3895)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3896) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3897) * If woken prematurely then reset kswapd_highest_zoneidx and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3898) * order. The values will either be from a wakeup request or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3899) * from the previous request whose sleep was cut short.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3900) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3901) if (remaining) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3902) WRITE_ONCE(pgdat->kswapd_highest_zoneidx,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3903) kswapd_highest_zoneidx(pgdat,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3904) highest_zoneidx));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3905)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3906) if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3907) WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3908) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3909)
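/*
 * Re-arm the wait before the full-sleep check so a wakeup arriving
 * between the short and the long sleep is not missed.
 */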
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3910) finish_wait(&pgdat->kswapd_wait, &wait);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3911) prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3912) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3913)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3914) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3915) * After a short sleep, check if it was a premature sleep. If not, then
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3916) * go fully to sleep until explicitly woken up.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3917) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3918) if (!remaining &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3919) prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3920) trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3921)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3922) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3923) * vmstat counters are not perfectly accurate and the estimated
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3924) * value for counters such as NR_FREE_PAGES can deviate from the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3925) * true value by nr_online_cpus * threshold. To avoid the zone
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3926) * watermarks being breached while under pressure, we reduce the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3927) * per-cpu vmstat thresholds while kswapd is awake and restore
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3928) * them before going back to sleep.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3929) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3930) set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3931)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3932) if (!kthread_should_stop())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3933) schedule();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3934)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3935) set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3936) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3937) if (remaining)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3938) count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3939) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3940) count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3941) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3942) finish_wait(&pgdat->kswapd_wait, &wait);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3943) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3944)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3945) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3946) * The background pageout daemon, started as a kernel thread
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3947) * from the init process.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3948) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3949) * This basically trickles out pages so that we have _some_
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3950) * free memory available even if there is no other activity
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3951) * that frees anything up. This is needed for things like routing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3952) * etc., where we otherwise might have all activity going on in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3953) * asynchronous contexts that cannot page things out.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3954) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3955) * If there are applications that are active memory-allocators
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3956) * (most normal use), this basically shouldn't matter.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3957) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3958) static int kswapd(void *p)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3959) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3960) unsigned int alloc_order, reclaim_order;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3961) unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3962) pg_data_t *pgdat = (pg_data_t*)p;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3963) struct task_struct *tsk = current;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3964) const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3965)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3966) if (!cpumask_empty(cpumask))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3967) set_cpus_allowed_ptr(tsk, cpumask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3968)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3969) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3970) * Tell the memory management that we're a "memory allocator",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3971) * and that if we need more memory we should get access to it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3972) * regardless (see "__alloc_pages()"). "kswapd" should
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3973) * never get caught in the normal page freeing logic.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3974) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3975) * (Kswapd normally doesn't need memory anyway, but sometimes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3976) * you need a small amount of memory in order to be able to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3977) * page out something else, and this flag essentially protects
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3978) * us from recursively trying to free more memory as we're
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3979) * trying to free the first piece of memory in the first place).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3980) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3981) tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3982) set_freezable();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3983)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3984) WRITE_ONCE(pgdat->kswapd_order, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3985) WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
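/*
 * Main loop: sleep until woken by wakeup_kswapd(), then balance the node
 * for the requested order and highest zone index.
 */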
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3986) for ( ; ; ) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3987) bool ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3988)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3989) alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3990) highest_zoneidx = kswapd_highest_zoneidx(pgdat,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3991) highest_zoneidx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3992)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3993) kswapd_try_sleep:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3994) kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3995) highest_zoneidx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3996)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3997) /* Read the new order and highest_zoneidx */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3998) alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3999) highest_zoneidx = kswapd_highest_zoneidx(pgdat,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4000) highest_zoneidx);
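/*
 * Clear the request so that a wakeup arriving while reclaim runs is
 * recorded with fresh values.
 */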
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4001) WRITE_ONCE(pgdat->kswapd_order, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4002) WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4003)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4004) ret = try_to_freeze();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4005) if (kthread_should_stop())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4006) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4007)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4008) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4009) * We can speed up thawing tasks if we don't call balance_pgdat
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4010) * after returning from the refrigerator.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4011) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4012) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4013) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4014)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4015) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4016) * Reclaim begins at the requested order but if a high-order
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4017) * reclaim fails then kswapd falls back to reclaiming for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4018) * order-0. If that happens, kswapd will consider sleeping
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4019) * for the order it finished reclaiming at (reclaim_order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4020) * but kcompactd is woken to compact for the original
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4021) * request (alloc_order).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4022) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4023) trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4024) alloc_order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4025) reclaim_order = balance_pgdat(pgdat, alloc_order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4026) highest_zoneidx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4027) if (reclaim_order < alloc_order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4028) goto kswapd_try_sleep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4029) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4030)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4031) tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4032)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4033) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4034) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4035)
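/*
 * When kswapd_threads > 1, start that many kswapd threads for this node,
 * named "kswapd<nid>:<hid>". pgdat->kswapd keeps pointing at the first
 * thread that started successfully so single-thread users keep working.
 */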
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4036) static int kswapd_per_node_run(int nid)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4037) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4038) pg_data_t *pgdat = NODE_DATA(nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4039) int hid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4040) int ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4041)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4042) for (hid = 0; hid < kswapd_threads; ++hid) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4043) pgdat->mkswapd[hid] = kthread_run(kswapd, pgdat, "kswapd%d:%d",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4044) nid, hid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4045) if (IS_ERR(pgdat->mkswapd[hid])) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4046) /* a failure at boot is unexpected; warn but try the remaining threads */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4047) WARN_ON(system_state < SYSTEM_RUNNING);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4048) pr_err("Failed to start kswapd%d on node %d\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4049) hid, nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4050) ret = PTR_ERR(pgdat->mkswapd[hid]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4051) pgdat->mkswapd[hid] = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4052) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4053) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4054) if (!pgdat->kswapd)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4055) pgdat->kswapd = pgdat->mkswapd[hid];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4056) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4057)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4058) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4059) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4060)
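/* Stop and forget every kswapd thread started by kswapd_per_node_run(). */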
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4061) static void kswapd_per_node_stop(int nid)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4062) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4063) int hid = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4064) struct task_struct *kswapd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4065)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4066) for (hid = 0; hid < kswapd_threads; hid++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4067) kswapd = NODE_DATA(nid)->mkswapd[hid];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4068) if (kswapd) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4069) kthread_stop(kswapd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4070) NODE_DATA(nid)->mkswapd[hid] = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4071) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4072) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4073) NODE_DATA(nid)->kswapd = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4074) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4075)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4076) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4077) * A zone is low on free memory or too fragmented for high-order memory. If
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4078) * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4079) * pgdat. It will wake up kcompactd after reclaiming memory. If kswapd reclaim
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4080) * has failed or is not needed, still wake up kcompactd if only compaction is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4081) * needed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4082) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4083) void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4084) enum zone_type highest_zoneidx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4085) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4086) pg_data_t *pgdat;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4087) enum zone_type curr_idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4088)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4089) if (!managed_zone(zone))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4090) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4091)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4092) if (!cpuset_zone_allowed(zone, gfp_flags))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4093) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4094)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4095) pgdat = zone->zone_pgdat;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4096) curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4097)
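/* Record the most demanding of the pending wakeup requests. */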
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4098) if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4099) WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4100)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4101) if (READ_ONCE(pgdat->kswapd_order) < order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4102) WRITE_ONCE(pgdat->kswapd_order, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4103)
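/* If kswapd is not sleeping on the waitqueue, it is already running. */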
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4104) if (!waitqueue_active(&pgdat->kswapd_wait))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4105) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4106)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4107) /* Hopeless node, leave it to direct reclaim if possible */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4108) if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4109) (pgdat_balanced(pgdat, order, highest_zoneidx) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4110) !pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4111) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4112) * There may be plenty of free memory available, but it's too
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4113) * fragmented for high-order allocations. Wake up kcompactd
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4114) * and rely on compaction_suitable() to determine if it's
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4115) * needed. If it fails, it will defer subsequent attempts to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4116) * ratelimit its work.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4117) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4118) if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4119) wakeup_kcompactd(pgdat, order, highest_zoneidx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4120) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4121) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4122)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4123) trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4124) gfp_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4125) wake_up_interruptible(&pgdat->kswapd_wait);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4126) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4127)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4128) #ifdef CONFIG_HIBERNATION
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4129) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4130) * Try to free `nr_to_reclaim' pages of memory, system-wide, and return the number of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4131) * freed pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4132) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4133) * Rather than trying to age LRUs, the aim is to preserve the overall
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4134) * LRU order by reclaiming preferentially:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4135) * inactive > active > active referenced > active mapped
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4136) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4137) unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4138) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4139) struct scan_control sc = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4140) .nr_to_reclaim = nr_to_reclaim,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4141) .gfp_mask = GFP_HIGHUSER_MOVABLE,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4142) .reclaim_idx = MAX_NR_ZONES - 1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4143) .priority = DEF_PRIORITY,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4144) .may_writepage = 1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4145) .may_unmap = 1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4146) .may_swap = 1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4147) .hibernation_mode = 1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4148) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4149) struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4150) unsigned long nr_reclaimed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4151) unsigned int noreclaim_flag;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4152)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4153) fs_reclaim_acquire(sc.gfp_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4154) noreclaim_flag = memalloc_noreclaim_save();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4155) set_task_reclaim_state(current, &sc.reclaim_state);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4156)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4157) nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4158)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4159) set_task_reclaim_state(current, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4160) memalloc_noreclaim_restore(noreclaim_flag);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4161) fs_reclaim_release(sc.gfp_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4162)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4163) return nr_reclaimed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4164) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4165) #endif /* CONFIG_HIBERNATION */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4166)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4167) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4168) * This kswapd start function will be called by init and node-hot-add.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4169) * On node-hot-add, kswapd will be moved to the proper cpus if cpus are hot-added.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4170) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4171) int kswapd_run(int nid)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4172) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4173) pg_data_t *pgdat = NODE_DATA(nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4174) int ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4175)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4176) if (pgdat->kswapd)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4177) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4178)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4179) if (kswapd_threads > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4180) return kswapd_per_node_run(nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4181)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4182) pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4183) if (IS_ERR(pgdat->kswapd)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4184) /* failure at boot is fatal */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4185) BUG_ON(system_state < SYSTEM_RUNNING);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4186) pr_err("Failed to start kswapd on node %d\n", nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4187) ret = PTR_ERR(pgdat->kswapd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4188) pgdat->kswapd = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4189) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4190) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4191) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4192)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4193) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4194) * Called by memory hotplug when all memory in a node is offlined. Caller must
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4195) * hold mem_hotplug_begin/end().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4196) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4197) void kswapd_stop(int nid)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4198) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4199) struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4200)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4201) if (kswapd_threads > 1) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4202) kswapd_per_node_stop(nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4203) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4204) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4205)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4206) if (kswapd) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4207) kthread_stop(kswapd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4208) NODE_DATA(nid)->kswapd = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4209) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4210) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4211)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4212) static int __init kswapd_init(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4213) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4214) int nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4215)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4216) swap_setup();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4217) for_each_node_state(nid, N_MEMORY)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4218) kswapd_run(nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4219) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4220) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4221)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4222) module_init(kswapd_init)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4223)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4224) #ifdef CONFIG_NUMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4225) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4226) * Node reclaim mode
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4227) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4228) * If non-zero call node_reclaim when the number of free pages falls below
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4229) * the watermarks.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4230) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4231) int node_reclaim_mode __read_mostly;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4232)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4233) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4234) * These bit locations are exposed in the vm.zone_reclaim_mode sysctl
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4235) * ABI. New bits are OK, but existing bits can never change.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4236) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4237) #define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4238) #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4239) #define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4240)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4241) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4242) * Priority for NODE_RECLAIM. This determines the fraction of the pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4243) * of a node considered for each reclaim pass. A priority of 4 scans
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4244) * 1/16th of the node.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4245) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4246) #define NODE_RECLAIM_PRIORITY 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4247)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4248) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4249) * Percentage of pages in a node that must be unmapped for node_reclaim to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4250) * occur.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4251) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4252) int sysctl_min_unmapped_ratio = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4253)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4254) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4255) * If the number of slab pages in a node grows beyond this percentage then
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4256) * slab reclaim needs to occur.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4257) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4258) int sysctl_min_slab_ratio = 5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4259)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4260) static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4261) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4262) unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4263) unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4264) node_page_state(pgdat, NR_ACTIVE_FILE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4265)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4266) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4267) * It's possible for there to be more file mapped pages than
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4268) * accounted for by the pages on the file LRU lists because
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4269) * tmpfs pages accounted for as ANON can also be FILE_MAPPED.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4270) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4271) return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4272) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4273)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4274) /* Work out how many page cache pages we can reclaim in this reclaim_mode */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4275) static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4276) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4277) unsigned long nr_pagecache_reclaimable;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4278) unsigned long delta = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4279)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4280) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4281) * If RECLAIM_UNMAP is set, then all file pages are considered
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4282) * potentially reclaimable. Otherwise, we have to worry about
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4283) * pages like swapcache, and node_unmapped_file_pages() provides
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4284) * a better estimate.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4285) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4286) if (node_reclaim_mode & RECLAIM_UNMAP)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4287) nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4288) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4289) nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4290)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4291) /* If we can't clean pages, remove dirty pages from consideration */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4292) if (!(node_reclaim_mode & RECLAIM_WRITE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4293) delta += node_page_state(pgdat, NR_FILE_DIRTY);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4294)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4295) /* Watch for any possible underflows due to delta */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4296) if (unlikely(delta > nr_pagecache_reclaimable))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4297) delta = nr_pagecache_reclaimable;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4298)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4299) return nr_pagecache_reclaimable - delta;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4300) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4301)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4302) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4303) * Try to free up some pages from this node through reclaim.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4304) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4305) static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4306) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4307) /* Minimum pages needed in order to stay on node */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4308) const unsigned long nr_pages = 1 << order;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4309) struct task_struct *p = current;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4310) unsigned int noreclaim_flag;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4311) struct scan_control sc = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4312) .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4313) .gfp_mask = current_gfp_context(gfp_mask),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4314) .order = order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4315) .priority = NODE_RECLAIM_PRIORITY,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4316) .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4317) .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4318) .may_swap = 1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4319) .reclaim_idx = gfp_zone(gfp_mask),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4320) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4321)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4322) trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4323) sc.gfp_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4324)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4325) cond_resched();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4326) fs_reclaim_acquire(sc.gfp_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4327) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4328) * We need to be able to allocate from the reserves for RECLAIM_UNMAP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4329) * and we also need to be able to write out pages for RECLAIM_WRITE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4330) * and RECLAIM_UNMAP.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4331) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4332) noreclaim_flag = memalloc_noreclaim_save();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4333) p->flags |= PF_SWAPWRITE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4334) set_task_reclaim_state(p, &sc.reclaim_state);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4335)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4336) if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4337) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4338) * Free memory by calling shrink node with increasing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4339) * priorities until we have enough memory freed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4340) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4341) do {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4342) shrink_node(pgdat, &sc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4343) } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4344) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4345)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4346) set_task_reclaim_state(p, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4347) current->flags &= ~PF_SWAPWRITE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4348) memalloc_noreclaim_restore(noreclaim_flag);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4349) fs_reclaim_release(sc.gfp_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4350)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4351) trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4352)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4353) return sc.nr_reclaimed >= nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4354) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4355)
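/*
 * Entry point for node reclaim from the page allocator. Apply the unmapped
 * page and slab thresholds, honour the placement rules below, then run
 * __node_reclaim() under the per-node PGDAT_RECLAIM_LOCKED bit.
 */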
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4356) int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4357) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4358) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4359)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4360) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4361) * Node reclaim reclaims unmapped file backed pages and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4362) * slab pages if we are over the defined limits.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4363) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4364) * A small portion of unmapped file backed pages is needed for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4365) * file I/O; otherwise pages read by file I/O will be immediately
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4366) * thrown out if the node is overallocated. So we do not reclaim
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4367) * if less than a specified percentage of the node is used by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4368) * unmapped file backed pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4369) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4370) if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4371) node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) <=
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4372) pgdat->min_slab_pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4373) return NODE_RECLAIM_FULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4374)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4375) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4376) * Do not scan if the allocation should not be delayed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4377) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4378) if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4379) return NODE_RECLAIM_NOSCAN;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4380)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4381) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4382) * Only run node reclaim on the local node or on nodes that do not
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4383) * have associated processors. This will favor the local processor
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4384) * over remote processors and spread off node memory allocations
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4385) * as widely as possible.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4386) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4387) if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4388) return NODE_RECLAIM_NOSCAN;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4389)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4390) if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4391) return NODE_RECLAIM_NOSCAN;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4392)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4393) ret = __node_reclaim(pgdat, gfp_mask, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4394) clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4395)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4396) if (!ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4397) count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4398)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4399) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4400) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4401) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4402)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4403) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4404) * check_move_unevictable_pages - check pages for evictability and move to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4405) * appropriate zone lru list
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4406) * @pvec: pagevec with lru pages to check
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4407) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4408) * Checks pages for evictability; if an evictable page is on the unevictable
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4409) * lru list, it is moved to the appropriate evictable lru list. This function
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4410) * should only be used for lru pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4411) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4412) void check_move_unevictable_pages(struct pagevec *pvec)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4413) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4414) struct lruvec *lruvec;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4415) struct pglist_data *pgdat = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4416) int pgscanned = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4417) int pgrescued = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4418) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4419)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4420) for (i = 0; i < pvec->nr; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4421) struct page *page = pvec->pages[i];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4422) struct pglist_data *pagepgdat = page_pgdat(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4423) int nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4424)
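/* THP tail pages are skipped; the head page accounts for the whole THP. */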
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4425) if (PageTransTail(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4426) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4427)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4428) nr_pages = thp_nr_pages(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4429) pgscanned += nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4430)
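/* Pages may come from different nodes; switch lru_lock when crossing one. */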
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4431) if (pagepgdat != pgdat) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4432) if (pgdat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4433) spin_unlock_irq(&pgdat->lru_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4434) pgdat = pagepgdat;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4435) spin_lock_irq(&pgdat->lru_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4436) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4437) lruvec = mem_cgroup_page_lruvec(page, pgdat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4438)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4439) if (!PageLRU(page) || !PageUnevictable(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4440) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4441)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4442) if (page_evictable(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4443) enum lru_list lru = page_lru_base_type(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4444)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4445) VM_BUG_ON_PAGE(PageActive(page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4446) ClearPageUnevictable(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4447) del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4448) add_page_to_lru_list(page, lruvec, lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4449) pgrescued += nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4450) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4451) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4452)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4453) if (pgdat) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4454) __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4455) __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4456) spin_unlock_irq(&pgdat->lru_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4457) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4458) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4459) EXPORT_SYMBOL_GPL(check_move_unevictable_pages);