^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1) // SPDX-License-Identifier: GPL-2.0-only
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3) * linux/mm/oom_kill.c
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5) * Copyright (C) 1998,2000 Rik van Riel
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6) * Thanks go out to Claus Fischer for some serious inspiration and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7) * for goading me into coding this file...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8) * Copyright (C) 2010 Google, Inc.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9) * Rewritten by David Rientjes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 10) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 11) * The routines in this file are used to kill a process when
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 12) * we're seriously out of memory. This gets called from __alloc_pages()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 13) * in mm/page_alloc.c when we really run out of memory.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 14) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 15) * Since we won't call these routines often (on a well-configured
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 16) * machine) this file will double as a 'coding guide' and a signpost
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 17) * for newbie kernel hackers. It features several pointers to major
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 18) * kernel subsystems and hints as to where to find out what things do.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 19) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 20)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 21) #include <linux/oom.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 22) #include <linux/mm.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 23) #include <linux/err.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 24) #include <linux/gfp.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 25) #include <linux/sched.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 26) #include <linux/sched/mm.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 27) #include <linux/sched/coredump.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 28) #include <linux/sched/task.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 29) #include <linux/sched/debug.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 30) #include <linux/swap.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 31) #include <linux/syscalls.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 32) #include <linux/timex.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 33) #include <linux/jiffies.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 34) #include <linux/cpuset.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 35) #include <linux/export.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 36) #include <linux/notifier.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 37) #include <linux/memcontrol.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 38) #include <linux/mempolicy.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 39) #include <linux/security.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 40) #include <linux/ptrace.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 41) #include <linux/freezer.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 42) #include <linux/ftrace.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 43) #include <linux/ratelimit.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 44) #include <linux/kthread.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 45) #include <linux/init.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 46) #include <linux/mmu_notifier.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 47)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 48) #include <asm/tlb.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 49) #include "internal.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 50) #include "slab.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 51)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 52) #define CREATE_TRACE_POINTS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 53) #include <trace/events/oom.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 54)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 55) #undef CREATE_TRACE_POINTS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 56) #include <trace/hooks/mm.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 57)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 58) int sysctl_panic_on_oom;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 59) int sysctl_oom_kill_allocating_task;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 60) int sysctl_oom_dump_tasks = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 61)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 62) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 63) * Serializes oom killer invocations (out_of_memory()) from all contexts to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 64) * prevent from over eager oom killing (e.g. when the oom killer is invoked
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 65) * from different domains).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 66) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 67) * oom_killer_disable() relies on this lock to stabilize oom_killer_disabled
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 68) * and mark_oom_victim
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 69) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 70) DEFINE_MUTEX(oom_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 71) /* Serializes oom_score_adj and oom_score_adj_min updates */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 72) DEFINE_MUTEX(oom_adj_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 73)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 74) static inline bool is_memcg_oom(struct oom_control *oc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 75) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 76) return oc->memcg != NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 77) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 78)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 79) #ifdef CONFIG_NUMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 80) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 81) * oom_cpuset_eligible() - check task eligiblity for kill
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 82) * @start: task struct of which task to consider
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 83) * @oc: pointer to struct oom_control
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 84) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 85) * Task eligibility is determined by whether or not a candidate task, @tsk,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 86) * shares the same mempolicy nodes as current if it is bound by such a policy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 87) * and whether or not it has the same set of allowed cpuset nodes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 88) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 89) * This function is assuming oom-killer context and 'current' has triggered
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 90) * the oom-killer.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 91) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 92) static bool oom_cpuset_eligible(struct task_struct *start,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 93) struct oom_control *oc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 94) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 95) struct task_struct *tsk;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 96) bool ret = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 97) const nodemask_t *mask = oc->nodemask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 98)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 99) if (is_memcg_oom(oc))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 100) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 101)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 102) rcu_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 103) for_each_thread(start, tsk) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 104) if (mask) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 105) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 106) * If this is a mempolicy constrained oom, tsk's
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 107) * cpuset is irrelevant. Only return true if its
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 108) * mempolicy intersects current, otherwise it may be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 109) * needlessly killed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 110) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 111) ret = mempolicy_nodemask_intersects(tsk, mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 112) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 113) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 114) * This is not a mempolicy constrained oom, so only
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 115) * check the mems of tsk's cpuset.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 116) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 117) ret = cpuset_mems_allowed_intersects(current, tsk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 118) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 119) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 120) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 121) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 122) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 123)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 124) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 125) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 126) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 127) static bool oom_cpuset_eligible(struct task_struct *tsk, struct oom_control *oc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 128) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 129) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 130) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 131) #endif /* CONFIG_NUMA */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 132)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 133) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 134) * The process p may have detached its own ->mm while exiting or through
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 135) * kthread_use_mm(), but one or more of its subthreads may still have a valid
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 136) * pointer. Return p, or any of its subthreads with a valid ->mm, with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 137) * task_lock() held.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 138) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 139) struct task_struct *find_lock_task_mm(struct task_struct *p)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 140) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 141) struct task_struct *t;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 142)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 143) rcu_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 144)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 145) for_each_thread(p, t) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 146) task_lock(t);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 147) if (likely(t->mm))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 148) goto found;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 149) task_unlock(t);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 150) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 151) t = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 152) found:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 153) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 154)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 155) return t;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 156) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 157)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 158) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 159) * order == -1 means the oom kill is required by sysrq, otherwise only
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 160) * for display purposes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 161) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 162) static inline bool is_sysrq_oom(struct oom_control *oc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 163) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 164) return oc->order == -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 165) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 166)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 167) /* return true if the task is not adequate as candidate victim task. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 168) static bool oom_unkillable_task(struct task_struct *p)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 169) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 170) if (is_global_init(p))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 171) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 172) if (p->flags & PF_KTHREAD)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 173) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 174) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 175) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 176)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 177) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 178) * Print out unreclaimble slabs info when unreclaimable slabs amount is greater
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 179) * than all user memory (LRU pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 180) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 181) static bool is_dump_unreclaim_slabs(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 182) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 183) unsigned long nr_lru;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 184)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 185) nr_lru = global_node_page_state(NR_ACTIVE_ANON) +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 186) global_node_page_state(NR_INACTIVE_ANON) +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 187) global_node_page_state(NR_ACTIVE_FILE) +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 188) global_node_page_state(NR_INACTIVE_FILE) +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 189) global_node_page_state(NR_ISOLATED_ANON) +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 190) global_node_page_state(NR_ISOLATED_FILE) +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 191) global_node_page_state(NR_UNEVICTABLE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 192)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 193) return (global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B) > nr_lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 194) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 195)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 196) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 197) * oom_badness - heuristic function to determine which candidate task to kill
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 198) * @p: task struct of which task we should calculate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 199) * @totalpages: total present RAM allowed for page allocation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 200) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 201) * The heuristic for determining which task to kill is made to be as simple and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 202) * predictable as possible. The goal is to return the highest value for the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 203) * task consuming the most memory to avoid subsequent oom failures.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 204) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 205) long oom_badness(struct task_struct *p, unsigned long totalpages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 206) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 207) long points;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 208) long adj;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 209)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 210) if (oom_unkillable_task(p))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 211) return LONG_MIN;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 212)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 213) p = find_lock_task_mm(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 214) if (!p)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 215) return LONG_MIN;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 216)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 217) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 218) * Do not even consider tasks which are explicitly marked oom
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 219) * unkillable or have been already oom reaped or the are in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 220) * the middle of vfork
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 221) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 222) adj = (long)p->signal->oom_score_adj;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 223) if (adj == OOM_SCORE_ADJ_MIN ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 224) test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 225) in_vfork(p)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 226) task_unlock(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 227) return LONG_MIN;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 228) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 229)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 230) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 231) * The baseline for the badness score is the proportion of RAM that each
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 232) * task's rss, pagetable and swap space use.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 233) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 234) points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 235) mm_pgtables_bytes(p->mm) / PAGE_SIZE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 236) task_unlock(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 237)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 238) /* Normalize to oom_score_adj units */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 239) adj *= totalpages / 1000;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 240) points += adj;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 241)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 242) return points;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 243) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 244)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 245) static const char * const oom_constraint_text[] = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 246) [CONSTRAINT_NONE] = "CONSTRAINT_NONE",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 247) [CONSTRAINT_CPUSET] = "CONSTRAINT_CPUSET",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 248) [CONSTRAINT_MEMORY_POLICY] = "CONSTRAINT_MEMORY_POLICY",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 249) [CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 250) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 251)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 252) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 253) * Determine the type of allocation constraint.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 254) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 255) static enum oom_constraint constrained_alloc(struct oom_control *oc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 256) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 257) struct zone *zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 258) struct zoneref *z;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 259) enum zone_type highest_zoneidx = gfp_zone(oc->gfp_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 260) bool cpuset_limited = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 261) int nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 262)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 263) if (is_memcg_oom(oc)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 264) oc->totalpages = mem_cgroup_get_max(oc->memcg) ?: 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 265) return CONSTRAINT_MEMCG;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 266) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 267)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 268) /* Default to all available memory */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 269) oc->totalpages = totalram_pages() + total_swap_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 270)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 271) if (!IS_ENABLED(CONFIG_NUMA))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 272) return CONSTRAINT_NONE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 273)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 274) if (!oc->zonelist)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 275) return CONSTRAINT_NONE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 276) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 277) * Reach here only when __GFP_NOFAIL is used. So, we should avoid
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 278) * to kill current.We have to random task kill in this case.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 279) * Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 280) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 281) if (oc->gfp_mask & __GFP_THISNODE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 282) return CONSTRAINT_NONE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 283)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 284) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 285) * This is not a __GFP_THISNODE allocation, so a truncated nodemask in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 286) * the page allocator means a mempolicy is in effect. Cpuset policy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 287) * is enforced in get_page_from_freelist().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 288) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 289) if (oc->nodemask &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 290) !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 291) oc->totalpages = total_swap_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 292) for_each_node_mask(nid, *oc->nodemask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 293) oc->totalpages += node_present_pages(nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 294) return CONSTRAINT_MEMORY_POLICY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 295) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 296)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 297) /* Check this allocation failure is caused by cpuset's wall function */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 298) for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 299) highest_zoneidx, oc->nodemask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 300) if (!cpuset_zone_allowed(zone, oc->gfp_mask))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 301) cpuset_limited = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 302)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 303) if (cpuset_limited) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 304) oc->totalpages = total_swap_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 305) for_each_node_mask(nid, cpuset_current_mems_allowed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 306) oc->totalpages += node_present_pages(nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 307) return CONSTRAINT_CPUSET;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 308) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 309) return CONSTRAINT_NONE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 310) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 311)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 312) static int oom_evaluate_task(struct task_struct *task, void *arg)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 313) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 314) struct oom_control *oc = arg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 315) long points;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 316)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 317) if (oom_unkillable_task(task))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 318) goto next;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 319)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 320) /* p may not have freeable memory in nodemask */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 321) if (!is_memcg_oom(oc) && !oom_cpuset_eligible(task, oc))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 322) goto next;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 323)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 324) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 325) * This task already has access to memory reserves and is being killed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 326) * Don't allow any other task to have access to the reserves unless
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 327) * the task has MMF_OOM_SKIP because chances that it would release
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 328) * any memory is quite low.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 329) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 330) if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 331) if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 332) goto next;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 333) goto abort;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 334) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 335)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 336) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 337) * If task is allocating a lot of memory and has been marked to be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 338) * killed first if it triggers an oom, then select it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 339) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 340) if (oom_task_origin(task)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 341) points = LONG_MAX;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 342) goto select;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 343) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 344)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 345) points = oom_badness(task, oc->totalpages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 346)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 347) if (points == LONG_MIN)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 348) goto next;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 349)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 350) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 351) * Check to see if this is the worst task with a non-negative
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 352) * ADJ score seen so far
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 353) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 354) if (task->signal->oom_score_adj >= 0 &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 355) points > oc->chosen_non_negative_adj_points) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 356) if (oc->chosen_non_negative_adj)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 357) put_task_struct(oc->chosen_non_negative_adj);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 358) get_task_struct(task);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 359) oc->chosen_non_negative_adj = task;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 360) oc->chosen_non_negative_adj_points = points;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 361) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 362)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 363) if (points < oc->chosen_points)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 364) goto next;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 365)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 366) select:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 367) if (oc->chosen)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 368) put_task_struct(oc->chosen);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 369) get_task_struct(task);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 370) oc->chosen = task;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 371) oc->chosen_points = points;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 372) next:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 373) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 374) abort:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 375) if (oc->chosen_non_negative_adj)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 376) put_task_struct(oc->chosen_non_negative_adj);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 377) if (oc->chosen)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 378) put_task_struct(oc->chosen);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 379) oc->chosen_non_negative_adj = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 380) oc->chosen = (void *)-1UL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 381) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 382) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 383)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 384) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 385) * Simple selection loop. We choose the process with the highest number of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 386) * 'points'. In case scan was aborted, oc->chosen is set to -1.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 387) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 388) static void select_bad_process(struct oom_control *oc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 389) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 390) oc->chosen_points = LONG_MIN;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 391) oc->chosen_non_negative_adj_points = LONG_MIN;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 392) oc->chosen_non_negative_adj = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 393)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 394) if (is_memcg_oom(oc))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 395) mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 396) else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 397) struct task_struct *p;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 398)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 399) rcu_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 400) for_each_process(p)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 401) if (oom_evaluate_task(p, oc))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 402) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 403) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 404) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 405)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 406) if (oc->chosen_non_negative_adj) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 407) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 408) * If oc->chosen has a negative ADJ, and we found a task with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 409) * a postive ADJ to kill, kill the task with the positive ADJ
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 410) * instead.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 411) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 412) if (oc->chosen && oc->chosen->signal->oom_score_adj < 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 413) put_task_struct(oc->chosen);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 414) oc->chosen = oc->chosen_non_negative_adj;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 415) oc->chosen_points = oc->chosen_non_negative_adj_points;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 416) } else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 417) put_task_struct(oc->chosen_non_negative_adj);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 418) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 419) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 420)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 421) static int dump_task(struct task_struct *p, void *arg)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 422) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 423) struct oom_control *oc = arg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 424) struct task_struct *task;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 425)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 426) if (oom_unkillable_task(p))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 427) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 428)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 429) /* p may not have freeable memory in nodemask */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 430) if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 431) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 432)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 433) task = find_lock_task_mm(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 434) if (!task) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 435) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 436) * This is a kthread or all of p's threads have already
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 437) * detached their mm's. There's no need to report
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 438) * them; they can't be oom killed anyway.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 439) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 440) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 441) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 442)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 443) pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 444) task->pid, from_kuid(&init_user_ns, task_uid(task)),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 445) task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 446) mm_pgtables_bytes(task->mm),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 447) get_mm_counter(task->mm, MM_SWAPENTS),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 448) task->signal->oom_score_adj, task->comm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 449) task_unlock(task);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 450)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 451) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 452) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 453)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 454) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 455) * dump_tasks - dump current memory state of all system tasks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 456) * @oc: pointer to struct oom_control
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 457) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 458) * Dumps the current memory state of all eligible tasks. Tasks not in the same
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 459) * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 460) * are not shown.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 461) * State information includes task's pid, uid, tgid, vm size, rss,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 462) * pgtables_bytes, swapents, oom_score_adj value, and name.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 463) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 464) static void dump_tasks(struct oom_control *oc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 465) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 466) pr_info("Tasks state (memory values in pages):\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 467) pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 468)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 469) if (is_memcg_oom(oc))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 470) mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 471) else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 472) struct task_struct *p;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 473)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 474) rcu_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 475) for_each_process(p)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 476) dump_task(p, oc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 477) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 478) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 479) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 480)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 481) static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 482) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 483) /* one line summary of the oom killer context. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 484) pr_info("oom-kill:constraint=%s,nodemask=%*pbl",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 485) oom_constraint_text[oc->constraint],
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 486) nodemask_pr_args(oc->nodemask));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 487) cpuset_print_current_mems_allowed();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 488) mem_cgroup_print_oom_context(oc->memcg, victim);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 489) pr_cont(",task=%s,pid=%d,uid=%d\n", victim->comm, victim->pid,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 490) from_kuid(&init_user_ns, task_uid(victim)));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 491) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 492)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 493) static void dump_header(struct oom_control *oc, struct task_struct *p)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 494) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 495) pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 496) current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 497) current->signal->oom_score_adj);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 498) if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 499) pr_warn("COMPACTION is disabled!!!\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 500)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 501) dump_stack();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 502) if (is_memcg_oom(oc))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 503) mem_cgroup_print_oom_meminfo(oc->memcg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 504) else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 505) show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 506) if (is_dump_unreclaim_slabs())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 507) dump_unreclaimable_slab();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 508) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 509) if (sysctl_oom_dump_tasks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 510) dump_tasks(oc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 511) if (p)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 512) dump_oom_summary(oc, p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 513) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 514)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 515) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 516) * Number of OOM victims in flight
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 517) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 518) static atomic_t oom_victims = ATOMIC_INIT(0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 519) static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 520)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 521) static bool oom_killer_disabled __read_mostly;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 522)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 523) #define K(x) ((x) << (PAGE_SHIFT-10))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 524)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 525) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 526) * task->mm can be NULL if the task is the exited group leader. So to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 527) * determine whether the task is using a particular mm, we examine all the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 528) * task's threads: if one of those is using this mm then this task was also
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 529) * using it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 530) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 531) bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 532) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 533) struct task_struct *t;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 534)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 535) for_each_thread(p, t) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 536) struct mm_struct *t_mm = READ_ONCE(t->mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 537) if (t_mm)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 538) return t_mm == mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 539) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 540) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 541) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 542)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 543) #ifdef CONFIG_MMU
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 544) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 545) * OOM Reaper kernel thread which tries to reap the memory used by the OOM
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 546) * victim (if that is possible) to help the OOM killer to move on.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 547) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 548) static struct task_struct *oom_reaper_th;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 549) static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 550) static struct task_struct *oom_reaper_list;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 551) static DEFINE_SPINLOCK(oom_reaper_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 552)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 553) bool __oom_reap_task_mm(struct mm_struct *mm)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 554) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 555) struct vm_area_struct *vma;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 556) bool ret = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 557)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 558) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 559) * Tell all users of get_user/copy_from_user etc... that the content
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 560) * is no longer stable. No barriers really needed because unmapping
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 561) * should imply barriers already and the reader would hit a page fault
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 562) * if it stumbled over a reaped memory.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 563) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 564) set_bit(MMF_UNSTABLE, &mm->flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 565)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 566) for (vma = mm->mmap ; vma; vma = vma->vm_next) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 567) if (!can_madv_lru_vma(vma))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 568) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 569)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 570) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 571) * Only anonymous pages have a good chance to be dropped
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 572) * without additional steps which we cannot afford as we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 573) * are OOM already.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 574) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 575) * We do not even care about fs backed pages because all
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 576) * which are reclaimable have already been reclaimed and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 577) * we do not want to block exit_mmap by keeping mm ref
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 578) * count elevated without a good reason.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 579) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 580) if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 581) struct mmu_notifier_range range;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 582) struct mmu_gather tlb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 583)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 584) mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 585) vma, mm, vma->vm_start,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 586) vma->vm_end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 587) tlb_gather_mmu(&tlb, mm, range.start, range.end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 588) if (mmu_notifier_invalidate_range_start_nonblock(&range)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 589) tlb_finish_mmu(&tlb, range.start, range.end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 590) ret = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 591) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 592) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 593) unmap_page_range(&tlb, vma, range.start, range.end, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 594) mmu_notifier_invalidate_range_end(&range);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 595) tlb_finish_mmu(&tlb, range.start, range.end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 596) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 597) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 598)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 599) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 600) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 601)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 602) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 603) * Reaps the address space of the give task.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 604) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 605) * Returns true on success and false if none or part of the address space
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 606) * has been reclaimed and the caller should retry later.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 607) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 608) static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 609) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 610) bool ret = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 611)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 612) if (!mmap_read_trylock(mm)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 613) trace_skip_task_reaping(tsk->pid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 614) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 615) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 616)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 617) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 618) * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 619) * work on the mm anymore. The check for MMF_OOM_SKIP must run
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 620) * under mmap_lock for reading because it serializes against the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 621) * mmap_write_lock();mmap_write_unlock() cycle in exit_mmap().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 622) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 623) if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 624) trace_skip_task_reaping(tsk->pid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 625) goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 626) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 627)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 628) trace_start_task_reaping(tsk->pid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 629)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 630) /* failed to reap part of the address space. Try again later */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 631) ret = __oom_reap_task_mm(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 632) if (!ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 633) goto out_finish;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 634)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 635) pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 636) task_pid_nr(tsk), tsk->comm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 637) K(get_mm_counter(mm, MM_ANONPAGES)),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 638) K(get_mm_counter(mm, MM_FILEPAGES)),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 639) K(get_mm_counter(mm, MM_SHMEMPAGES)));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 640) out_finish:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 641) trace_finish_task_reaping(tsk->pid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 642) out_unlock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 643) mmap_read_unlock(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 644)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 645) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 646) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 647)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 648) #define MAX_OOM_REAP_RETRIES 10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 649) static void oom_reap_task(struct task_struct *tsk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 650) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 651) int attempts = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 652) struct mm_struct *mm = tsk->signal->oom_mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 653)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 654) /* Retry the mmap_read_trylock(mm) a few times */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 655) while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 656) schedule_timeout_idle(HZ/10);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 657)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 658) if (attempts <= MAX_OOM_REAP_RETRIES ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 659) test_bit(MMF_OOM_SKIP, &mm->flags))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 660) goto done;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 661)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 662) pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 663) task_pid_nr(tsk), tsk->comm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 664) sched_show_task(tsk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 665) debug_show_all_locks();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 666)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 667) done:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 668) tsk->oom_reaper_list = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 669)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 670) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 671) * Hide this mm from OOM killer because it has been either reaped or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 672) * somebody can't call mmap_write_unlock(mm).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 673) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 674) set_bit(MMF_OOM_SKIP, &mm->flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 675)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 676) /* Drop a reference taken by wake_oom_reaper */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 677) put_task_struct(tsk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 678) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 679)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 680) static int oom_reaper(void *unused)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 681) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 682) while (true) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 683) struct task_struct *tsk = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 684)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 685) wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 686) spin_lock(&oom_reaper_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 687) if (oom_reaper_list != NULL) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 688) tsk = oom_reaper_list;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 689) oom_reaper_list = tsk->oom_reaper_list;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 690) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 691) spin_unlock(&oom_reaper_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 692)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 693) if (tsk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 694) oom_reap_task(tsk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 695) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 696)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 697) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 698) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 699)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 700) static void wake_oom_reaper(struct task_struct *tsk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 701) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 702) /* mm is already queued? */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 703) if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 704) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 705)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 706) get_task_struct(tsk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 707)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 708) spin_lock(&oom_reaper_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 709) tsk->oom_reaper_list = oom_reaper_list;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 710) oom_reaper_list = tsk;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 711) spin_unlock(&oom_reaper_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 712) trace_wake_reaper(tsk->pid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 713) wake_up(&oom_reaper_wait);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 714) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 715)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 716) static int __init oom_init(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 717) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 718) oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 719) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 720) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 721) subsys_initcall(oom_init)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 722) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 723) static inline void wake_oom_reaper(struct task_struct *tsk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 724) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 725) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 726) #endif /* CONFIG_MMU */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 727)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 728) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 729) * tsk->mm has to be non NULL and caller has to guarantee it is stable (either
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 730) * under task_lock or operate on the current).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 731) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 732) static void __mark_oom_victim(struct task_struct *tsk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 733) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 734) struct mm_struct *mm = tsk->mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 735)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 736) if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 737) mmgrab(tsk->signal->oom_mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 738) set_bit(MMF_OOM_VICTIM, &mm->flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 739) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 740) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 741)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 742) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 743) * mark_oom_victim - mark the given task as OOM victim
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 744) * @tsk: task to mark
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 745) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 746) * Has to be called with oom_lock held and never after
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 747) * oom has been disabled already.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 748) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 749) * tsk->mm has to be non NULL and caller has to guarantee it is stable (either
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 750) * under task_lock or operate on the current).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 751) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 752) static void mark_oom_victim(struct task_struct *tsk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 753) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 754) WARN_ON(oom_killer_disabled);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 755) /* OOM killer might race with memcg OOM */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 756) if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 757) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 758)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 759) /* oom_mm is bound to the signal struct life time. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 760) __mark_oom_victim(tsk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 761)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 762) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 763) * Make sure that the task is woken up from uninterruptible sleep
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 764) * if it is frozen because OOM killer wouldn't be able to free
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 765) * any memory and livelock. freezing_slow_path will tell the freezer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 766) * that TIF_MEMDIE tasks should be ignored.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 767) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 768) __thaw_task(tsk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 769) atomic_inc(&oom_victims);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 770) trace_mark_victim(tsk->pid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 771) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 772)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 773) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 774) * exit_oom_victim - note the exit of an OOM victim
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 775) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 776) void exit_oom_victim(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 777) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 778) clear_thread_flag(TIF_MEMDIE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 779)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 780) if (!atomic_dec_return(&oom_victims))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 781) wake_up_all(&oom_victims_wait);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 782) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 783)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 784) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 785) * oom_killer_enable - enable OOM killer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 786) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 787) void oom_killer_enable(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 788) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 789) oom_killer_disabled = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 790) pr_info("OOM killer enabled.\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 791) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 792)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 793) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 794) * oom_killer_disable - disable OOM killer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 795) * @timeout: maximum timeout to wait for oom victims in jiffies
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 796) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 797) * Forces all page allocations to fail rather than trigger OOM killer.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 798) * Will block and wait until all OOM victims are killed or the given
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 799) * timeout expires.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 800) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 801) * The function cannot be called when there are runnable user tasks because
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 802) * the userspace would see unexpected allocation failures as a result. Any
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 803) * new usage of this function should be consulted with MM people.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 804) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 805) * Returns true if successful and false if the OOM killer cannot be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 806) * disabled.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 807) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 808) bool oom_killer_disable(signed long timeout)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 809) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 810) signed long ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 811)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 812) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 813) * Make sure to not race with an ongoing OOM killer. Check that the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 814) * current is not killed (possibly due to sharing the victim's memory).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 815) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 816) if (mutex_lock_killable(&oom_lock))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 817) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 818) oom_killer_disabled = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 819) mutex_unlock(&oom_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 820)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 821) ret = wait_event_interruptible_timeout(oom_victims_wait,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 822) !atomic_read(&oom_victims), timeout);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 823) if (ret <= 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 824) oom_killer_enable();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 825) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 826) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 827) pr_info("OOM killer disabled.\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 828)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 829) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 830) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 831)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 832) static inline bool __task_will_free_mem(struct task_struct *task)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 833) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 834) struct signal_struct *sig = task->signal;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 835)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 836) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 837) * A coredumping process may sleep for an extended period in exit_mm(),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 838) * so the oom killer cannot assume that the process will promptly exit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 839) * and release memory.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 840) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 841) if (sig->flags & SIGNAL_GROUP_COREDUMP)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 842) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 843)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 844) if (sig->flags & SIGNAL_GROUP_EXIT)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 845) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 846)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 847) if (thread_group_empty(task) && (task->flags & PF_EXITING))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 848) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 849)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 850) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 851) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 852)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 853) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 854) * Checks whether the given task is dying or exiting and likely to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 855) * release its address space. This means that all threads and processes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 856) * sharing the same mm have to be killed or exiting.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 857) * Caller has to make sure that task->mm is stable (hold task_lock or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 858) * it operates on the current).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 859) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 860) static bool task_will_free_mem(struct task_struct *task)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 861) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 862) struct mm_struct *mm = task->mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 863) struct task_struct *p;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 864) bool ret = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 865)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 866) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 867) * Skip tasks without mm because it might have passed its exit_mm and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 868) * exit_oom_victim. oom_reaper could have rescued that but do not rely
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 869) * on that for now. We can consider find_lock_task_mm in future.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 870) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 871) if (!mm)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 872) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 873)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 874) if (!__task_will_free_mem(task))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 875) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 876)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 877) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 878) * This task has already been drained by the oom reaper so there are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 879) * only small chances it will free some more
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 880) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 881) if (test_bit(MMF_OOM_SKIP, &mm->flags))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 882) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 883)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 884) if (atomic_read(&mm->mm_users) <= 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 885) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 886)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 887) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 888) * Make sure that all tasks which share the mm with the given tasks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 889) * are dying as well to make sure that a) nobody pins its mm and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 890) * b) the task is also reapable by the oom reaper.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 891) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 892) rcu_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 893) for_each_process(p) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 894) if (!process_shares_mm(p, mm))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 895) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 896) if (same_thread_group(task, p))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 897) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 898) ret = __task_will_free_mem(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 899) if (!ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 900) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 901) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 902) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 903)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 904) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 905) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 906)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 907) static void __oom_kill_process(struct task_struct *victim, const char *message)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 908) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 909) struct task_struct *p;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 910) struct mm_struct *mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 911) bool can_oom_reap = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 912)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 913) p = find_lock_task_mm(victim);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 914) if (!p) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 915) pr_info("%s: OOM victim %d (%s) is already exiting. Skip killing the task\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 916) message, task_pid_nr(victim), victim->comm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 917) put_task_struct(victim);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 918) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 919) } else if (victim != p) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 920) get_task_struct(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 921) put_task_struct(victim);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 922) victim = p;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 923) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 924)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 925) /* Get a reference to safely compare mm after task_unlock(victim) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 926) mm = victim->mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 927) mmgrab(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 928)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 929) /* Raise event before sending signal: task reaper must see this */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 930) count_vm_event(OOM_KILL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 931) memcg_memory_event_mm(mm, MEMCG_OOM_KILL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 932)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 933) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 934) * We should send SIGKILL before granting access to memory reserves
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 935) * in order to prevent the OOM victim from depleting the memory
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 936) * reserves from the user space under its control.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 937) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 938) do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 939) mark_oom_victim(victim);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 940) pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 941) message, task_pid_nr(victim), victim->comm, K(mm->total_vm),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 942) K(get_mm_counter(mm, MM_ANONPAGES)),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 943) K(get_mm_counter(mm, MM_FILEPAGES)),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 944) K(get_mm_counter(mm, MM_SHMEMPAGES)),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 945) from_kuid(&init_user_ns, task_uid(victim)),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 946) mm_pgtables_bytes(mm) >> 10, victim->signal->oom_score_adj);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 947) task_unlock(victim);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 948)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 949) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 950) * Kill all user processes sharing victim->mm in other thread groups, if
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 951) * any. They don't get access to memory reserves, though, to avoid
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 952) * depletion of all memory. This prevents mm->mmap_lock livelock when an
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 953) * oom killed thread cannot exit because it requires the semaphore and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 954) * its contended by another thread trying to allocate memory itself.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 955) * That thread will now get access to memory reserves since it has a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 956) * pending fatal signal.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 957) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 958) rcu_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 959) for_each_process(p) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 960) if (!process_shares_mm(p, mm))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 961) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 962) if (same_thread_group(p, victim))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 963) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 964) if (is_global_init(p)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 965) can_oom_reap = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 966) set_bit(MMF_OOM_SKIP, &mm->flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 967) pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 968) task_pid_nr(victim), victim->comm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 969) task_pid_nr(p), p->comm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 970) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 971) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 972) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 973) * No kthead_use_mm() user needs to read from the userspace so
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 974) * we are ok to reap it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 975) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 976) if (unlikely(p->flags & PF_KTHREAD))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 977) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 978) do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 979) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 980) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 981)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 982) if (can_oom_reap)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 983) wake_oom_reaper(victim);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 984)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 985) mmdrop(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 986) put_task_struct(victim);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 987) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 988) #undef K
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 989)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 990) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 991) * Kill provided task unless it's secured by setting
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 992) * oom_score_adj to OOM_SCORE_ADJ_MIN.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 993) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 994) static int oom_kill_memcg_member(struct task_struct *task, void *message)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 995) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 996) if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 997) !is_global_init(task)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 998) get_task_struct(task);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 999) __oom_kill_process(task, message);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1000) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1001) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1002) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1003)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1004) static void oom_kill_process(struct oom_control *oc, const char *message)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1005) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1006) struct task_struct *victim = oc->chosen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1007) struct mem_cgroup *oom_group;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1008) static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1009) DEFAULT_RATELIMIT_BURST);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1010)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1011) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1012) * If the task is already exiting, don't alarm the sysadmin or kill
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1013) * its children or threads, just give it access to memory reserves
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1014) * so it can die quickly
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1015) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1016) task_lock(victim);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1017) if (task_will_free_mem(victim)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1018) mark_oom_victim(victim);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1019) wake_oom_reaper(victim);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1020) task_unlock(victim);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1021) put_task_struct(victim);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1022) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1023) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1024) task_unlock(victim);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1025)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1026) if (__ratelimit(&oom_rs))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1027) dump_header(oc, victim);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1028)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1029) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1030) * Do we need to kill the entire memory cgroup?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1031) * Or even one of the ancestor memory cgroups?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1032) * Check this out before killing the victim task.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1033) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1034) oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1035)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1036) __oom_kill_process(victim, message);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1037)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1038) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1039) * If necessary, kill all tasks in the selected memory cgroup.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1040) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1041) if (oom_group) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1042) mem_cgroup_print_oom_group(oom_group);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1043) mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1044) (void*)message);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1045) mem_cgroup_put(oom_group);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1046) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1047) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1048)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1049) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1050) * Determines whether the kernel must panic because of the panic_on_oom sysctl.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1051) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1052) static void check_panic_on_oom(struct oom_control *oc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1053) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1054) if (likely(!sysctl_panic_on_oom))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1055) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1056) if (sysctl_panic_on_oom != 2) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1057) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1058) * panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1059) * does not panic for cpuset, mempolicy, or memcg allocation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1060) * failures.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1061) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1062) if (oc->constraint != CONSTRAINT_NONE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1063) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1064) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1065) /* Do not panic for oom kills triggered by sysrq */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1066) if (is_sysrq_oom(oc))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1067) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1068) dump_header(oc, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1069) panic("Out of memory: %s panic_on_oom is enabled\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1070) sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1071) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1072)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1073) static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1074)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1075) int register_oom_notifier(struct notifier_block *nb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1076) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1077) return blocking_notifier_chain_register(&oom_notify_list, nb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1078) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1079) EXPORT_SYMBOL_GPL(register_oom_notifier);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1080)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1081) int unregister_oom_notifier(struct notifier_block *nb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1082) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1083) return blocking_notifier_chain_unregister(&oom_notify_list, nb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1084) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1085) EXPORT_SYMBOL_GPL(unregister_oom_notifier);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1086)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1087) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1088) * out_of_memory - kill the "best" process when we run out of memory
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1089) * @oc: pointer to struct oom_control
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1090) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1091) * If we run out of memory, we have the choice between either
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1092) * killing a random task (bad), letting the system crash (worse)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1093) * OR try to be smart about which process to kill. Note that we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1094) * don't have to be perfect here, we just have to be good.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1095) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1096) bool out_of_memory(struct oom_control *oc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1097) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1098) unsigned long freed = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1099)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1100) if (oom_killer_disabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1101) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1102)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1103) if (!is_memcg_oom(oc)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1104) blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1105) if (freed > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1106) /* Got some memory back in the last second. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1107) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1108) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1109)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1110) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1111) * If current has a pending SIGKILL or is exiting, then automatically
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1112) * select it. The goal is to allow it to allocate so that it may
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1113) * quickly exit and free its memory.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1114) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1115) if (task_will_free_mem(current)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1116) mark_oom_victim(current);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1117) wake_oom_reaper(current);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1118) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1119) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1120)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1121) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1122) * The OOM killer does not compensate for IO-less reclaim.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1123) * pagefault_out_of_memory lost its gfp context so we have to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1124) * make sure exclude 0 mask - all other users should have at least
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1125) * ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1126) * invoke the OOM killer even if it is a GFP_NOFS allocation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1127) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1128) if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1129) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1130)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1131) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1132) * Check if there were limitations on the allocation (only relevant for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1133) * NUMA and memcg) that may require different handling.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1134) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1135) oc->constraint = constrained_alloc(oc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1136) if (oc->constraint != CONSTRAINT_MEMORY_POLICY)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1137) oc->nodemask = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1138) check_panic_on_oom(oc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1139)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1140) if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1141) current->mm && !oom_unkillable_task(current) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1142) oom_cpuset_eligible(current, oc) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1143) current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1144) get_task_struct(current);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1145) oc->chosen = current;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1146) oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1147) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1148) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1149)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1150) select_bad_process(oc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1151) /* Found nothing?!?! */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1152) if (!oc->chosen) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1153) int ret = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1154)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1155) trace_android_vh_oom_check_panic(oc, &ret);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1156) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1157) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1158)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1159) dump_header(oc, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1160) pr_warn("Out of memory and no killable processes...\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1161) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1162) * If we got here due to an actual allocation at the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1163) * system level, we cannot survive this and will enter
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1164) * an endless loop in the allocator. Bail out now.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1165) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1166) if (!is_sysrq_oom(oc) && !is_memcg_oom(oc))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1167) panic("System is deadlocked on memory\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1168) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1169) if (oc->chosen && oc->chosen != (void *)-1UL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1170) oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1171) "Memory cgroup out of memory");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1172) return !!oc->chosen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1173) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1174)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1175) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1176) * The pagefault handler calls here because some allocation has failed. We have
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1177) * to take care of the memcg OOM here because this is the only safe context without
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1178) * any locks held but let the oom killer triggered from the allocation context care
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1179) * about the global OOM.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1180) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1181) void pagefault_out_of_memory(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1182) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1183) static DEFINE_RATELIMIT_STATE(pfoom_rs, DEFAULT_RATELIMIT_INTERVAL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1184) DEFAULT_RATELIMIT_BURST);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1185)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1186) if (mem_cgroup_oom_synchronize(true))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1187) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1188)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1189) if (fatal_signal_pending(current))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1190) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1191)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1192) if (__ratelimit(&pfoom_rs))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1193) pr_warn("Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1194) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1195)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1196) SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1197) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1198) #ifdef CONFIG_MMU
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1199) struct mm_struct *mm = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1200) struct task_struct *task;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1201) struct task_struct *p;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1202) unsigned int f_flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1203) bool reap = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1204) struct pid *pid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1205) long ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1206)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1207) if (flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1208) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1209)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1210) pid = pidfd_get_pid(pidfd, &f_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1211) if (IS_ERR(pid))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1212) return PTR_ERR(pid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1213)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1214) task = get_pid_task(pid, PIDTYPE_TGID);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1215) if (!task) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1216) ret = -ESRCH;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1217) goto put_pid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1218) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1219)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1220) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1221) * Make sure to choose a thread which still has a reference to mm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1222) * during the group exit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1223) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1224) p = find_lock_task_mm(task);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1225) if (!p) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1226) ret = -ESRCH;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1227) goto put_task;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1228) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1229)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1230) mm = p->mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1231) mmgrab(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1232)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1233) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1234) * If we are too late and exit_mmap already checked mm_is_oom_victim
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1235) * then will block on mmap_read_lock until exit_mmap releases mmap_lock
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1236) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1237) set_bit(MMF_OOM_VICTIM, &mm->flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1238)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1239) if (task_will_free_mem(p))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1240) reap = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1241) else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1242) /* Error only if the work has not been done already */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1243) if (!test_bit(MMF_OOM_SKIP, &mm->flags))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1244) ret = -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1245) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1246) task_unlock(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1247)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1248) if (!reap)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1249) goto drop_mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1250)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1251) if (mmap_read_lock_killable(mm)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1252) ret = -EINTR;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1253) goto drop_mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1254) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1255) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1256) * Check MMF_OOM_SKIP again under mmap_read_lock protection to ensure
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1257) * possible change in exit_mmap is seen
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1258) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1259) if (!test_bit(MMF_OOM_SKIP, &mm->flags) && !__oom_reap_task_mm(mm))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1260) ret = -EAGAIN;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1261) mmap_read_unlock(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1262)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1263) drop_mm:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1264) mmdrop(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1265) put_task:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1266) put_task_struct(task);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1267) put_pid:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1268) put_pid(pid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1269) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1270) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1271) return -ENOSYS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1272) #endif /* CONFIG_MMU */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1273) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1274)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1275) void add_to_oom_reaper(struct task_struct *p)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1276) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1277) p = find_lock_task_mm(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1278) if (!p)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1279) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1280)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1281) get_task_struct(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1282) if (task_will_free_mem(p)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1283) __mark_oom_victim(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1284) wake_oom_reaper(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1285) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1286) task_unlock(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1287) put_task_struct(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1288) }