// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2008, 2009 Intel Corporation
 * Authors: Andi Kleen, Fengguang Wu
 *
 * High level machine check handler. Handles pages reported by the
 * hardware as being corrupted, usually due to a multi-bit ECC memory or
 * cache failure.
 *
 * In addition there is a "soft offline" entry point that allows us to stop
 * using pages that are suspected (but not yet confirmed) to be corrupted,
 * without killing anything.
 *
 * Handles page cache pages in various states. The tricky part
 * here is that we can access any page asynchronously with respect to
 * other VM users, because memory failures could happen anytime and
 * anywhere. This could violate some of their assumptions. This is why
 * this code has to be extremely careful. Generally it tries to use
 * normal locking rules, as in get the standard locks, even if that means
 * the error handling takes potentially a long time.
 *
 * It can be very tempting to add handling for obscure cases here.
 * In general any code for handling new cases should only be added if:
 * - You know how to test it.
 * - You have a test that can be added to mce-test
 *   https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/
 * - The case actually shows up as a frequent (top 10) page state in
 *   tools/vm/page-types when running a real workload.
 *
 * There are several operations here with exponential complexity because
 * of unsuitable VM data structures. For example the operation to map back
 * from RMAP chains to processes has to walk the complete process list and
 * scales non-linearly with the number of processes. But since memory
 * corruptions are rare we hope to get away with this. This avoids impacting
 * the core VM.
 */
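/*
 * A rough sketch of the page-state survey mentioned above (hypothetical
 * invocation, details depend on the tree): running tools/vm/page-types as
 * root while the workload is active prints a histogram of page flag
 * combinations, which is how the "top 10 page states" can be identified
 * before adding a new handler here.
 */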
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/kernel-page-flags.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/backing-dev.h>
#include <linux/migrate.h>
#include <linux/suspend.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memory_hotplug.h>
#include <linux/mm_inline.h>
#include <linux/memremap.h>
#include <linux/kfifo.h>
#include <linux/ratelimit.h>
#include <linux/page-isolation.h>
#include "internal.h"
#include "ras/ras_event.h"

int sysctl_memory_failure_early_kill __read_mostly = 0;

int sysctl_memory_failure_recovery __read_mostly = 1;
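/*
 * The two knobs above are exposed as the vm.memory_failure_early_kill and
 * vm.memory_failure_recovery sysctls. A minimal usage sketch:
 *
 *	sysctl -w vm.memory_failure_early_kill=1   # signal mappers immediately
 *	sysctl -w vm.memory_failure_recovery=0     # panic instead of recovering
 */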

atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);

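/*
 * Mark a page as poisoned on behalf of soft offline. For a free page or a
 * free hugetlb page, the page is first taken off the allocator (dissolved
 * or removed from the buddy lists) so it can never be handed out again; if
 * that fails, the page is left alone and false is returned. On success the
 * HWPoison flag is set, the caller's reference is dropped when @release is
 * true, an extra reference is taken to pin the page, and the poisoned-page
 * counter is bumped.
 */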
static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release)
{
	if (hugepage_or_freepage) {
		/*
		 * Doing this check for free pages is also fine since dissolve_free_huge_page
		 * returns 0 for non-hugetlb pages as well.
		 */
		if (dissolve_free_huge_page(page) || !take_page_off_buddy(page))
			/*
			 * We could fail to take off the target page from buddy
			 * for example due to racy page allocation, but that's
			 * acceptable because the soft-offlined page is not broken
			 * and if someone really wants to use it, they should
			 * take it.
			 */
			return false;
	}

	SetPageHWPoison(page);
	if (release)
		put_page(page);
	page_ref_inc(page);
	num_poisoned_pages_inc();

	return true;
}

#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)

u32 hwpoison_filter_enable = 0;
u32 hwpoison_filter_dev_major = ~0U;
u32 hwpoison_filter_dev_minor = ~0U;
u64 hwpoison_filter_flags_mask;
u64 hwpoison_filter_flags_value;
EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);
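/*
 * These filter knobs are normally driven from the hwpoison-inject debugfs
 * interface (see mm/hwpoison-inject.c), i.e. the corrupt-filter-enable,
 * corrupt-filter-dev-major/minor and corrupt-filter-flags-mask/value files
 * under /sys/kernel/debug/hwpoison/. This is only a sketch of the usual
 * test setup, not something this file depends on.
 */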

static int hwpoison_filter_dev(struct page *p)
{
	struct address_space *mapping;
	dev_t dev;

	if (hwpoison_filter_dev_major == ~0U &&
	    hwpoison_filter_dev_minor == ~0U)
		return 0;

	/*
	 * page_mapping() does not accept slab pages.
	 */
	if (PageSlab(p))
		return -EINVAL;

	mapping = page_mapping(p);
	if (mapping == NULL || mapping->host == NULL)
		return -EINVAL;

	dev = mapping->host->i_sb->s_dev;
	if (hwpoison_filter_dev_major != ~0U &&
	    hwpoison_filter_dev_major != MAJOR(dev))
		return -EINVAL;
	if (hwpoison_filter_dev_minor != ~0U &&
	    hwpoison_filter_dev_minor != MINOR(dev))
		return -EINVAL;

	return 0;
}

static int hwpoison_filter_flags(struct page *p)
{
	if (!hwpoison_filter_flags_mask)
		return 0;

	if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
				    hwpoison_filter_flags_value)
		return 0;
	else
		return -EINVAL;
}
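/*
 * Worked example (a sketch, using the KPF_* bits from
 * include/uapi/linux/kernel-page-flags.h that stable_page_flags() reports):
 * setting both hwpoison_filter_flags_mask and hwpoison_filter_flags_value
 * to (1UL << KPF_LRU) restricts injection to pages that are currently on
 * the LRU; any page whose masked flags differ is skipped with -EINVAL.
 */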

/*
 * This allows stress tests to limit test scope to a collection of tasks
 * by putting them under some memcg. This prevents killing unrelated/important
 * processes such as /sbin/init. Note that the target task may share clean
 * pages with init (eg. libc text), which is harmless. If the target task
 * shares _dirty_ pages with another task B, the test scheme must make sure B
 * is also included in the memcg. Finally, due to race conditions this filter
 * can only guarantee that the page either belongs to the memcg tasks, or is
 * a freed page.
 */
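/*
 * Hypothetical test setup (assuming the hwpoison-inject debugfs files are
 * available): put the stress-test tasks in a dedicated memcg and write the
 * inode number of that cgroup directory, which is what page_cgroup_ino()
 * reports for its pages, to /sys/kernel/debug/hwpoison/corrupt-filter-memcg.
 */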
#ifdef CONFIG_MEMCG
u64 hwpoison_filter_memcg;
EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
static int hwpoison_filter_task(struct page *p)
{
	if (!hwpoison_filter_memcg)
		return 0;

	if (page_cgroup_ino(p) != hwpoison_filter_memcg)
		return -EINVAL;

	return 0;
}
#else
static int hwpoison_filter_task(struct page *p) { return 0; }
#endif

int hwpoison_filter(struct page *p)
{
	if (!hwpoison_filter_enable)
		return 0;

	if (hwpoison_filter_dev(p))
		return -EINVAL;

	if (hwpoison_filter_flags(p))
		return -EINVAL;

	if (hwpoison_filter_task(p))
		return -EINVAL;

	return 0;
}
#else
int hwpoison_filter(struct page *p)
{
	return 0;
}
#endif

EXPORT_SYMBOL_GPL(hwpoison_filter);

/*
 * Kill all processes that have a poisoned page mapped and then isolate
 * the page.
 *
 * General strategy:
 * Find all processes having the page mapped and kill them.
 * But we keep a page reference around so that the page is not
 * actually freed yet.
 * Then stash the page away.
 *
 * There's no convenient way to get back to mapped processes
 * from the VMAs. So do a brute-force search over all
 * running processes.
 *
 * Remember that machine checks are not common (or rather
 * if they are common you have other problems), so this shouldn't
 * be a performance issue.
 *
 * Also there are some races possible between the error detection
 * and actually handling it.
 */

struct to_kill {
	struct list_head nd;
	struct task_struct *tsk;
	unsigned long addr;
	short size_shift;
};
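/*
 * Each to_kill entry records one task to signal: @addr is the user virtual
 * address at which the poisoned page is mapped in @tsk's address space (or
 * -EFAULT if no address could be found), and @size_shift is the log2 of the
 * mapping size that will be reported to userspace as si_addr_lsb.
 */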

/*
 * Send a signal to all the processes that have the page mapped:
 * ``action optional'' if they are not immediately affected by the error,
 * ``action required'' if the error happened in the current execution context.
 */
static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
{
	struct task_struct *t = tk->tsk;
	short addr_lsb = tk->size_shift;
	int ret = 0;

	pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
		pfn, t->comm, t->pid);

	if (flags & MF_ACTION_REQUIRED) {
		WARN_ON_ONCE(t != current);
		ret = force_sig_mceerr(BUS_MCEERR_AR,
					(void __user *)tk->addr, addr_lsb);
	} else {
		/*
		 * Don't use force here, it's convenient if the signal
		 * can be temporarily blocked.
		 * This could cause a loop when the user sets SIGBUS
		 * to SIG_IGN, but hopefully no one will do that?
		 */
		ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr,
				      addr_lsb, t);  /* synchronous? */
	}
	if (ret < 0)
		pr_info("Memory failure: Error sending signal to %s:%d: %d\n",
			t->comm, t->pid, ret);
	return ret;
}
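/*
 * For reference, a minimal sketch (not kernel code) of how an application
 * can consume these signals: install a SIGBUS handler with SA_SIGINFO and
 * inspect the siginfo fields filled in above. handle_poison() below is a
 * hypothetical application callback.
 *
 *	static void sigbus_handler(int sig, siginfo_t *si, void *ctx)
 *	{
 *		if (si->si_code == BUS_MCEERR_AO || si->si_code == BUS_MCEERR_AR)
 *			// si->si_addr is the poisoned address,
 *			// (1UL << si->si_addr_lsb) the affected range.
 *			handle_poison(si->si_addr, si->si_addr_lsb);
 *	}
 */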

/*
 * When an unknown page type is encountered drain as many buffers as possible
 * in the hope of turning the page into an LRU or free page, which we can handle.
 */
void shake_page(struct page *p, int access)
{
	if (PageHuge(p))
		return;

	if (!PageSlab(p)) {
		lru_add_drain_all();
		if (PageLRU(p))
			return;
		drain_all_pages(page_zone(p));
		if (PageLRU(p) || is_free_buddy_page(p))
			return;
	}

	/*
	 * Only call drop_slab_node() here (which would also shrink
	 * other caches) if access is not potentially fatal.
	 */
	if (access)
		drop_slab_node(page_to_nid(p));
}
EXPORT_SYMBOL_GPL(shake_page);

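/*
 * Walk the page tables to find out with what granularity a ZONE_DEVICE page
 * is mapped in @vma: returns PUD_SHIFT, PMD_SHIFT or PAGE_SHIFT for a devmap
 * mapping at the corresponding level, or 0 if the page is not mapped there.
 * The result is used as the si_addr_lsb hint in the SIGBUS payload.
 */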
static unsigned long dev_pagemap_mapping_shift(struct page *page,
		struct vm_area_struct *vma)
{
	unsigned long address = vma_address(page, vma);
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = pgd_offset(vma->vm_mm, address);
	if (!pgd_present(*pgd))
		return 0;
	p4d = p4d_offset(pgd, address);
	if (!p4d_present(*p4d))
		return 0;
	pud = pud_offset(p4d, address);
	if (!pud_present(*pud))
		return 0;
	if (pud_devmap(*pud))
		return PUD_SHIFT;
	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return 0;
	if (pmd_devmap(*pmd))
		return PMD_SHIFT;
	pte = pte_offset_map(pmd, address);
	if (!pte_present(*pte))
		return 0;
	if (pte_devmap(*pte))
		return PAGE_SHIFT;
	return 0;
}

/*
 * Failure handling: if we can't find or can't kill a process there's
 * not much we can do. We just print a message and otherwise ignore it.
 */

/*
 * Schedule a process for later kill.
 * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
 */
static void add_to_kill(struct task_struct *tsk, struct page *p,
			struct vm_area_struct *vma,
			struct list_head *to_kill)
{
	struct to_kill *tk;

	tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
	if (!tk) {
		pr_err("Memory failure: Out of memory while machine check handling\n");
		return;
	}

	tk->addr = page_address_in_vma(p, vma);
	if (is_zone_device_page(p))
		tk->size_shift = dev_pagemap_mapping_shift(p, vma);
	else
		tk->size_shift = page_shift(compound_head(p));

	/*
	 * Send SIGKILL if "tk->addr == -EFAULT". Also, since
	 * "tk->size_shift" is always non-zero for !is_zone_device_page(),
	 * "tk->size_shift == 0" effectively checks for a missing mapping on
	 * ZONE_DEVICE. Indeed, when a devdax page is mmapped N times
	 * to a process' address space, it's possible not all N VMAs
	 * contain mappings for the page, but at least one VMA does.
	 * Only deliver SIGBUS with payload derived from the VMA that
	 * has a mapping for the page.
	 */
	if (tk->addr == -EFAULT) {
		pr_info("Memory failure: Unable to find user space address %lx in %s\n",
			page_to_pfn(p), tsk->comm);
	} else if (tk->size_shift == 0) {
		kfree(tk);
		return;
	}

	get_task_struct(tsk);
	tk->tsk = tsk;
	list_add_tail(&tk->nd, to_kill);
}

/*
 * Kill the processes that have been collected earlier.
 *
 * Only do anything when FORCEKILL is set, otherwise just free the list
 * (this is used for clean pages which do not need killing).
 * Also when FAIL is set do a force kill because something went
 * wrong earlier.
 */
static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
		unsigned long pfn, int flags)
{
	struct to_kill *tk, *next;

	list_for_each_entry_safe (tk, next, to_kill, nd) {
		if (forcekill) {
			/*
			 * In case something went wrong with munmapping
			 * make sure the process doesn't catch the
			 * signal and then access the memory. Just kill it.
			 */
			if (fail || tk->addr == -EFAULT) {
				pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
				       pfn, tk->tsk->comm, tk->tsk->pid);
				do_send_sig_info(SIGKILL, SEND_SIG_PRIV,
						 tk->tsk, PIDTYPE_PID);
			}

			/*
			 * In theory the process could have mapped
			 * something else at the address in between. We could
			 * check for that, but we need to tell the
			 * process anyway.
			 */
			else if (kill_proc(tk, pfn, flags) < 0)
				pr_err("Memory failure: %#lx: Cannot send advisory machine check signal to %s:%d\n",
				       pfn, tk->tsk->comm, tk->tsk->pid);
		}
		put_task_struct(tk->tsk);
		kfree(tk);
	}
}

/*
 * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
 * on behalf of the thread group. Return task_struct of the (first found)
 * dedicated thread if found, and return NULL otherwise.
 *
 * We already hold read_lock(&tasklist_lock) in the caller, so we don't
 * have to call rcu_read_lock/unlock() in this function.
 */
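/*
 * PF_MCE_PROCESS and PF_MCE_EARLY are the per-thread flags controlled by
 * prctl(PR_MCE_KILL). A usage sketch from the application side:
 *
 *	prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
 *
 * makes the calling thread a candidate for the early BUS_MCEERR_AO signal,
 * while PR_MCE_KILL_LATE and PR_MCE_KILL_DEFAULT opt back out of it.
 */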
static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
{
	struct task_struct *t;

	for_each_thread(tsk, t) {
		if (t->flags & PF_MCE_PROCESS) {
			if (t->flags & PF_MCE_EARLY)
				return t;
		} else {
			if (sysctl_memory_failure_early_kill)
				return t;
		}
	}
	return NULL;
}

/*
 * Determine whether a given process is an "early kill" process which expects
 * to be signaled when some page under the process is hwpoisoned.
 * Return task_struct of the dedicated thread (main thread unless explicitly
 * specified) if the process is "early kill", and otherwise return NULL.
 *
 * Note that the above is true for the Action Optional case, but not for the
 * Action Required case where SIGBUS should be sent only to the current thread.
 */
static struct task_struct *task_early_kill(struct task_struct *tsk,
					   int force_early)
{
	if (!tsk->mm)
		return NULL;
	if (force_early) {
		/*
		 * Comparing ->mm here because current task might represent
		 * a subthread, while tsk always points to the main thread.
		 */
		if (tsk->mm == current->mm)
			return current;
		else
			return NULL;
	}
	return find_early_kill_thread(tsk);
}

/*
 * Collect processes when the error hit an anonymous page.
 */
static void collect_procs_anon(struct page *page, struct list_head *to_kill,
				int force_early)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct anon_vma *av;
	pgoff_t pgoff;

	av = page_lock_anon_vma_read(page);
	if (av == NULL)	/* Not actually mapped anymore */
		return;

	pgoff = page_to_pgoff(page);
	read_lock(&tasklist_lock);
	for_each_process (tsk) {
		struct anon_vma_chain *vmac;
		struct task_struct *t = task_early_kill(tsk, force_early);

		if (!t)
			continue;
		anon_vma_interval_tree_foreach(vmac, &av->rb_root,
					       pgoff, pgoff) {
			vma = vmac->vma;
			if (!page_mapped_in_vma(page, vma))
				continue;
			if (vma->vm_mm == t->mm)
				add_to_kill(t, page, vma, to_kill);
		}
	}
	read_unlock(&tasklist_lock);
	page_unlock_anon_vma_read(av);
}

/*
 * Collect processes when the error hit a file mapped page.
 */
static void collect_procs_file(struct page *page, struct list_head *to_kill,
				int force_early)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct address_space *mapping = page->mapping;
	pgoff_t pgoff;

	i_mmap_lock_read(mapping);
	read_lock(&tasklist_lock);
	pgoff = page_to_pgoff(page);
	for_each_process(tsk) {
		struct task_struct *t = task_early_kill(tsk, force_early);

		if (!t)
			continue;
		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
				      pgoff) {
			/*
			 * Send an early kill signal to tasks where a vma covers
			 * the page even though the corrupted page is not
			 * necessarily mapped in their page tables.
			 * Assume applications that requested early kill want
			 * to be informed of all such data corruptions.
			 */
			if (vma->vm_mm == t->mm)
				add_to_kill(t, page, vma, to_kill);
		}
	}
	read_unlock(&tasklist_lock);
	i_mmap_unlock_read(mapping);
}

/*
 * Collect the processes who have the corrupted page mapped to kill.
 */
static void collect_procs(struct page *page, struct list_head *tokill,
			  int force_early)
{
	if (!page->mapping)
		return;

	if (PageAnon(page))
		collect_procs_anon(page, tokill, force_early);
	else
		collect_procs_file(page, tokill, force_early);
}

static const char *action_name[] = {
	[MF_IGNORED] = "Ignored",
	[MF_FAILED] = "Failed",
	[MF_DELAYED] = "Delayed",
	[MF_RECOVERED] = "Recovered",
};
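/*
 * Human-readable outcome strings, indexed by the MF_* result of the
 * individual page-state handlers. They are used later in this file when the
 * recovery action is reported, together with the page-type strings below
 * (and, via ras/ras_event.h, in the memory failure trace event).
 */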

static const char * const action_page_types[] = {
	[MF_MSG_KERNEL]			= "reserved kernel page",
	[MF_MSG_KERNEL_HIGH_ORDER]	= "high-order kernel page",
	[MF_MSG_SLAB]			= "kernel slab page",
	[MF_MSG_DIFFERENT_COMPOUND]	= "different compound page after locking",
	[MF_MSG_POISONED_HUGE]		= "huge page already hardware poisoned",
	[MF_MSG_HUGE]			= "huge page",
	[MF_MSG_FREE_HUGE]		= "free huge page",
	[MF_MSG_NON_PMD_HUGE]		= "non-pmd-sized huge page",
	[MF_MSG_UNMAP_FAILED]		= "unmapping failed page",
	[MF_MSG_DIRTY_SWAPCACHE]	= "dirty swapcache page",
	[MF_MSG_CLEAN_SWAPCACHE]	= "clean swapcache page",
	[MF_MSG_DIRTY_MLOCKED_LRU]	= "dirty mlocked LRU page",
	[MF_MSG_CLEAN_MLOCKED_LRU]	= "clean mlocked LRU page",
	[MF_MSG_DIRTY_UNEVICTABLE_LRU]	= "dirty unevictable LRU page",
	[MF_MSG_CLEAN_UNEVICTABLE_LRU]	= "clean unevictable LRU page",
	[MF_MSG_DIRTY_LRU]		= "dirty LRU page",
	[MF_MSG_CLEAN_LRU]		= "clean LRU page",
	[MF_MSG_TRUNCATED_LRU]		= "already truncated LRU page",
	[MF_MSG_BUDDY]			= "free buddy page",
	[MF_MSG_BUDDY_2ND]		= "free buddy page (2nd try)",
	[MF_MSG_DAX]			= "dax page",
	[MF_MSG_UNSPLIT_THP]		= "unsplit thp",
	[MF_MSG_UNKNOWN]		= "unknown page",
};

/*
 * XXX: It is possible that a page is isolated from the LRU cache,
 * and then kept in swap cache or fails to be removed from page cache.
 * The page count will stop it from being freed by unpoison.
 * Stress tests should be aware of this memory leak problem.
 */
static int delete_from_lru_cache(struct page *p)
{
	if (!isolate_lru_page(p)) {
		/*
		 * Clear the relevant page flags, so that the buddy system
		 * won't complain when the page is unpoisoned and freed.
		 */
		ClearPageActive(p);
		ClearPageUnevictable(p);

		/*
		 * Poisoned page might never drop its ref count to 0 so we have
		 * to uncharge it manually from its memcg.
		 */
		mem_cgroup_uncharge(p);

		/*
		 * drop the page count elevated by isolate_lru_page()
		 */
		put_page(p);
		return 0;
	}
	return -EIO;
}

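/*
 * Truncate the error page out of its mapping. ->error_remove_page is
 * usually generic_error_remove_page(), which drops just the affected page;
 * filesystems without it only get the weaker invalidate_inode_page() path
 * below, which refuses dirty pages and pages with private data.
 */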
static int truncate_error_page(struct page *p, unsigned long pfn,
				struct address_space *mapping)
{
	int ret = MF_FAILED;

	if (mapping->a_ops->error_remove_page) {
		int err = mapping->a_ops->error_remove_page(mapping, p);

		if (err != 0) {
			pr_info("Memory failure: %#lx: Failed to punch page: %d\n",
				pfn, err);
		} else if (page_has_private(p) &&
			   !try_to_release_page(p, GFP_NOIO)) {
			pr_info("Memory failure: %#lx: failed to release buffers\n",
				pfn);
		} else {
			ret = MF_RECOVERED;
		}
	} else {
		/*
		 * If the file system doesn't support it, just invalidate.
		 * This fails on dirty pages or anything with private data.
		 */
		if (invalidate_inode_page(p))
			ret = MF_RECOVERED;
		else
			pr_info("Memory failure: %#lx: Failed to invalidate\n",
				pfn);
	}

	return ret;
}

/*
 * Error hit a kernel page.
 * Do nothing, try to be lucky and not touch this instead. For a few cases we
 * could be more sophisticated.
 */
static int me_kernel(struct page *p, unsigned long pfn)
{
	return MF_IGNORED;
}

/*
 * Page in unknown state. Do nothing.
 */
static int me_unknown(struct page *p, unsigned long pfn)
{
	pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
	return MF_FAILED;
}

/*
 * Clean (or cleaned) page cache page.
 */
static int me_pagecache_clean(struct page *p, unsigned long pfn)
{
	struct address_space *mapping;

	delete_from_lru_cache(p);

	/*
	 * For anonymous pages we're done; the only reference left
	 * should be the one memory_failure() holds.
	 */
	if (PageAnon(p))
		return MF_RECOVERED;

	/*
	 * Now truncate the page in the page cache. This is really
	 * more like a "temporary hole punch".
	 * Don't do this for block devices when someone else
	 * has a reference, because it could be file system metadata
	 * and that's not safe to truncate.
	 */
	mapping = page_mapping(p);
	if (!mapping) {
		/*
		 * Page has been torn down in the meantime.
		 */
		return MF_FAILED;
	}

	/*
	 * Truncation is a bit tricky. Enable it per file system for now.
	 *
	 * Open question: should we take i_mutex for this? Right now we don't.
	 */
	return truncate_error_page(p, pfn, mapping);
}

/*
 * Dirty pagecache page.
 * Issues: when the error hits a hole page the error is not properly
 * propagated.
 */
static int me_pagecache_dirty(struct page *p, unsigned long pfn)
{
	struct address_space *mapping = page_mapping(p);

	SetPageError(p);
	/* TBD: print more information about the file. */
	if (mapping) {
		/*
		 * IO error will be reported by write(), fsync(), etc.
		 * which check the mapping.
		 * This way the application knows that something went
		 * wrong with its dirty file data.
		 *
		 * There's one open issue:
		 *
		 * The EIO will be only reported on the next IO
		 * operation and then cleared through the IO map.
		 * Normally Linux has two mechanisms to pass IO errors:
		 * first through the AS_EIO flag in the address space,
		 * and then through the PageError flag in the page.
		 * Since we drop pages on memory failure handling the
		 * only mechanism open to use is through AS_EIO.
		 *
		 * This has the disadvantage that it gets cleared on
		 * the first operation that returns an error, while
		 * the PageError bit is more sticky and only cleared
		 * when the page is reread or dropped. If an
		 * application assumes it will always get an error on
		 * fsync, but does other operations on the fd before,
		 * and the page is dropped in between, then the error
		 * will not be properly reported.
		 *
		 * This can already happen even without hwpoisoned
		 * pages: first on metadata IO errors (which only
		 * report through AS_EIO) or when the page is dropped
		 * at the wrong time.
		 *
		 * So right now we assume that the application DTRT on
		 * the first EIO, but we're not worse than other parts
		 * of the kernel.
		 */
		mapping_set_error(mapping, -EIO);
	}

	return me_pagecache_clean(p, pfn);
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 762)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 763) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 764) * Clean and dirty swap cache.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 765) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 766) * Dirty swap cache page is tricky to handle. The page could live both in page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 767) * cache and swap cache(ie. page is freshly swapped in). So it could be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 768) * referenced concurrently by 2 types of PTEs:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 769) * normal PTEs and swap PTEs. We try to handle them consistently by calling
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 770) * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 771) * and then
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 772) * - clear dirty bit to prevent IO
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 773) * - remove from LRU
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 774) * - but keep in the swap cache, so that when we return to it on
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 775) * a later page fault, we know the application is accessing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 776) * corrupted data and shall be killed (we installed simple
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 777) * interception code in do_swap_page to catch it).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 778) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 779) * Clean swap cache pages can be directly isolated. A later page fault will
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 780) * bring in the known good data from disk.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 781) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 782) static int me_swapcache_dirty(struct page *p, unsigned long pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 783) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 784) ClearPageDirty(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 785) /* Trigger EIO in shmem: */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 786) ClearPageUptodate(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 787)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 788) if (!delete_from_lru_cache(p))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 789) return MF_DELAYED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 790) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 791) return MF_FAILED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 792) }
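/*
 * The "simple interception code" referred to above lives in
 * do_swap_page(): a later fault that finds the still-poisoned page in
 * the swap cache is failed with VM_FAULT_HWPOISON, which gets the
 * faulting task a SIGBUS.  Roughly (simplified sketch, not a verbatim
 * copy of mm/memory.c):
 *
 *	page = lookup_swap_cache(entry, vma, vmf->address);
 *	...
 *	if (page && PageHWPoison(page)) {
 *		ret = VM_FAULT_HWPOISON;
 *		goto out_release;
 *	}
 */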
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 793)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 794) static int me_swapcache_clean(struct page *p, unsigned long pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 795) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 796) delete_from_swap_cache(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 797)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 798) if (!delete_from_lru_cache(p))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 799) return MF_RECOVERED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 800) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 801) return MF_FAILED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 802) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 803)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 804) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 805) * Huge pages. Needs work.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 806) * Issues:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 807) * - An error on a hugepage is contained in hugepage units (not in raw page units).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 808) * To narrow the kill region down to one page, we need to break up the pmd.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 809) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 810) static int me_huge_page(struct page *p, unsigned long pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 811) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 812) int res = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 813) struct page *hpage = compound_head(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 814) struct address_space *mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 815)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 816) if (!PageHuge(hpage))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 817) return MF_DELAYED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 818)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 819) mapping = page_mapping(hpage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 820) if (mapping) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 821) res = truncate_error_page(hpage, pfn, mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 822) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 823) unlock_page(hpage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 824) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 825) * The migration entry prevents later access to the error anonymous
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 826) * hugepage, so we can free it and dissolve it into the buddy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 827) * allocator to save the healthy subpages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 828) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 829) if (PageAnon(hpage))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 830) put_page(hpage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 831) dissolve_free_huge_page(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 832) res = MF_RECOVERED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 833) lock_page(hpage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 834) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 835)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 836) return res;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 837) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 838)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 839) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 840) * Various page states we can handle.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 841) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 842) * A page state is defined by its current page->flags bits.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 843) * The table matches them in order and calls the right handler.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 844) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 845) * This is quite tricky because we can access the page at any time
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 846) * in its life cycle, so all accesses have to be extremely careful.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 847) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 848) * This is not complete. More states could be added.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 849) * For any missing state don't attempt recovery.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 850) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 851)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 852) #define dirty (1UL << PG_dirty)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 853) #define sc ((1UL << PG_swapcache) | (1UL << PG_swapbacked))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 854) #define unevict (1UL << PG_unevictable)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 855) #define mlock (1UL << PG_mlocked)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 856) #define lru (1UL << PG_lru)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 857) #define head (1UL << PG_head)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 858) #define slab (1UL << PG_slab)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 859) #define reserved (1UL << PG_reserved)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 860)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 861) static struct page_state {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 862) unsigned long mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 863) unsigned long res;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 864) enum mf_action_page_type type;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 865) int (*action)(struct page *p, unsigned long pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 866) } error_states[] = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 867) { reserved, reserved, MF_MSG_KERNEL, me_kernel },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 868) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 869) * free pages are specially detected outside this table:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 870) * PG_buddy pages make up only a small fraction of all free pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 871) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 872)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 873) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 874) * Could in theory check if the slab page is free or if we can drop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 875) * currently unused objects without touching them. But just
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 876) * treat it as a standard kernel page for now.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 877) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 878) { slab, slab, MF_MSG_SLAB, me_kernel },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 879)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 880) { head, head, MF_MSG_HUGE, me_huge_page },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 881)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 882) { sc|dirty, sc|dirty, MF_MSG_DIRTY_SWAPCACHE, me_swapcache_dirty },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 883) { sc|dirty, sc, MF_MSG_CLEAN_SWAPCACHE, me_swapcache_clean },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 884)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 885) { mlock|dirty, mlock|dirty, MF_MSG_DIRTY_MLOCKED_LRU, me_pagecache_dirty },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 886) { mlock|dirty, mlock, MF_MSG_CLEAN_MLOCKED_LRU, me_pagecache_clean },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 887)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 888) { unevict|dirty, unevict|dirty, MF_MSG_DIRTY_UNEVICTABLE_LRU, me_pagecache_dirty },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 889) { unevict|dirty, unevict, MF_MSG_CLEAN_UNEVICTABLE_LRU, me_pagecache_clean },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 890)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 891) { lru|dirty, lru|dirty, MF_MSG_DIRTY_LRU, me_pagecache_dirty },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 892) { lru|dirty, lru, MF_MSG_CLEAN_LRU, me_pagecache_clean },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 893)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 894) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 895) * Catchall entry: must be at end.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 896) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 897) { 0, 0, MF_MSG_UNKNOWN, me_unknown },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 898) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 899)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 900) #undef dirty
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 901) #undef sc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 902) #undef unevict
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 903) #undef mlock
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 904) #undef lru
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 905) #undef head
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 906) #undef slab
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 907) #undef reserved
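/*
 * Worked example of the first-match lookup above (hypothetical flag
 * combinations, using the PG_* bits the table is built from):
 *
 *   page->flags has PG_lru | PG_dirty
 *	-> walks past the reserved/slab/head/swapcache/mlock/unevictable
 *	   entries and matches { lru|dirty, lru|dirty }
 *	-> me_pagecache_dirty(), reported as MF_MSG_DIRTY_LRU
 *
 *   page->flags has PG_swapcache | PG_swapbacked, PG_dirty clear
 *	-> (flags & (sc|dirty)) == sc, so { sc|dirty, sc } matches
 *	-> me_swapcache_clean(), reported as MF_MSG_CLEAN_SWAPCACHE
 *
 *   anything that matches no earlier entry falls through to the
 *   catchall { 0, 0 } and is handled by me_unknown().
 */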
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 908)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 909) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 910) * "Dirty/Clean" indication is not 100% accurate due to the possibility of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 911) * setting PG_dirty outside the page lock. See also the comment above set_page_dirty().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 912) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 913) static void action_result(unsigned long pfn, enum mf_action_page_type type,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 914) enum mf_result result)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 915) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 916) trace_memory_failure_event(pfn, type, result);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 917)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 918) pr_err("Memory failure: %#lx: recovery action for %s: %s\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 919) pfn, action_page_types[type], action_name[result]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 920) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 921)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 922) static int page_action(struct page_state *ps, struct page *p,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 923) unsigned long pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 924) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 925) int result;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 926) int count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 927)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 928) result = ps->action(p, pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 929)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 930) count = page_count(p) - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 931) if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 932) count--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 933) if (count > 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 934) pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 935) pfn, action_page_types[ps->type], count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 936) result = MF_FAILED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 937) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 938) action_result(pfn, ps->type, result);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 939)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 940) /* Could do more checks here if page looks ok */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 941) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 942) * Could adjust zone counters here to correct for the missing page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 943) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 944)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 945) return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 946) }
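/*
 * Reference accounting above, by example: the handling path itself
 * holds one reference (taken via get_hwpoison_page() or passed in with
 * MF_COUNT_INCREASED), so a fully isolated page is expected to end up
 * with page_count() == 1, i.e. count == 0.  A dirty swapcache page
 * handled as MF_DELAYED legitimately keeps one extra reference from
 * the swap cache, hence the additional count-- above.  Anything beyond
 * that means other users still hold the page and the outcome is
 * downgraded to MF_FAILED.
 */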
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 947)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 948) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 949) * get_hwpoison_page() - Get refcount for memory error handling:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 950) * @page: raw error page (hit by memory error)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 951) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 952) * Return: 0 if we failed to grab the refcount, otherwise a non-zero
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 953) * value (the reference was taken).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 954) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 955) static int get_hwpoison_page(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 956) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 957) struct page *head = compound_head(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 958)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 959) if (!PageHuge(head) && PageTransHuge(head)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 960) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 961) * A non-anonymous thp exists only at allocation/free time. We
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 962) * can't handle such a case correctly, so let's give up on it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 963) * This should be better than triggering a BUG_ON when the kernel
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 964) * tries to touch the "partially handled" page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 965) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 966) if (!PageAnon(head)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 967) pr_err("Memory failure: %#lx: non anonymous thp\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 968) page_to_pfn(page));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 969) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 970) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 971) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 972)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 973) if (get_page_unless_zero(head)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 974) if (head == compound_head(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 975) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 976)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 977) pr_info("Memory failure: %#lx cannot catch tail\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 978) page_to_pfn(page));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 979) put_page(head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 980) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 981)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 982) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 983) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 984)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 985) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 986) * Do all that is necessary to remove user space mappings. Unmap
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 987) * the pages and send SIGBUS to the processes if the data was dirty.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 988) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 989) static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 990) int flags, struct page **hpagep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 991) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 992) enum ttu_flags ttu = TTU_IGNORE_MLOCK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 993) struct address_space *mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 994) LIST_HEAD(tokill);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 995) bool unmap_success = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 996) int kill = 1, forcekill;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 997) struct page *hpage = *hpagep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 998) bool mlocked = PageMlocked(hpage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 999)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1000) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1001) * Here we are interested only in user-mapped pages, so skip any
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1002) * other types of pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1003) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1004) if (PageReserved(p) || PageSlab(p))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1005) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1006) if (!(PageLRU(hpage) || PageHuge(p)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1007) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1008)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1009) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1010) * This check implies we don't kill processes if their pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1011) * are in the swap cache early. Those are always late kills.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1012) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1013) if (!page_mapped(hpage))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1014) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1015)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1016) if (PageKsm(p)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1017) pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1018) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1019) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1020)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1021) if (PageSwapCache(p)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1022) pr_err("Memory failure: %#lx: keeping poisoned page in swap cache\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1023) pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1024) ttu |= TTU_IGNORE_HWPOISON;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1025) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1026)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1027) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1028) * Propagate the dirty bit from PTEs to struct page first, because we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1029) * need this to decide if we should kill or just drop the page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1030) * XXX: the dirty test could be racy: set_page_dirty() may not always
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1031) * be called inside page lock (it's recommended but not enforced).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1032) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1033) mapping = page_mapping(hpage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1034) if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1035) mapping_can_writeback(mapping)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1036) if (page_mkclean(hpage)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1037) SetPageDirty(hpage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1038) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1039) kill = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1040) ttu |= TTU_IGNORE_HWPOISON;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1041) pr_info("Memory failure: %#lx: corrupted page was clean: dropped without side effects\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1042) pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1043) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1044) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1045)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1046) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1047) * First collect all the processes that have the page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1048) * mapped in dirty form. This has to be done before try_to_unmap,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1049) * because ttu takes the rmap data structures down.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1050) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1051) * Error handling: We ignore errors here because
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1052) * there's nothing that can be done.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1053) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1054) if (kill)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1055) collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1056)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1057) if (!PageHuge(hpage)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1058) unmap_success = try_to_unmap(hpage, ttu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1059) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1060) if (!PageAnon(hpage)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1061) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1062) * For hugetlb pages in shared mappings, try_to_unmap
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1063) * could potentially call huge_pmd_unshare. Because of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1064) * this, take the semaphore in write mode here and set
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1065) * TTU_RMAP_LOCKED to indicate we have taken the lock
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1066) * at this higher level.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1067) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1068) mapping = hugetlb_page_mapping_lock_write(hpage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1069) if (mapping) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1070) unmap_success = try_to_unmap(hpage,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1071) ttu|TTU_RMAP_LOCKED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1072) i_mmap_unlock_write(mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1073) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1074) pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1075) unmap_success = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1076) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1077) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1078) unmap_success = try_to_unmap(hpage, ttu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1079) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1080) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1081) if (!unmap_success)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1082) pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1083) pfn, page_mapcount(hpage));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1084)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1085) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1086) * try_to_unmap() might put the mlocked page in the lru cache, so call
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1087) * shake_page() again to ensure that it's flushed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1088) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1089) if (mlocked)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1090) shake_page(hpage, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1091)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1092) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1093) * Now that the dirty bit has been propagated to the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1094) * struct page and all unmaps are done, we can decide if
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1095) * killing is needed or not. Only kill when the page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1096) * was dirty or the process is not restartable;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1097) * otherwise the tokill list is merely
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1098) * freed. When there was a problem unmapping earlier,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1099) * use a more forceful, uncatchable kill to prevent
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1100) * any accesses to the poisoned memory.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1101) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1102) forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1103) kill_procs(&tokill, forcekill, !unmap_success, pfn, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1104)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1105) return unmap_success;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1106) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1107)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1108) static int identify_page_state(unsigned long pfn, struct page *p,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1109) unsigned long page_flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1110) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1111) struct page_state *ps;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1112)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1113) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1114) * The first check uses the current page flags which may not have any
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1115) * relevant information. The second check with the saved page flags is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1116) * carried out only if the first check can't determine the page status.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1117) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1118) for (ps = error_states;; ps++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1119) if ((p->flags & ps->mask) == ps->res)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1120) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1121)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1122) page_flags |= (p->flags & (1UL << PG_dirty));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1123)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1124) if (!ps->mask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1125) for (ps = error_states;; ps++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1126) if ((page_flags & ps->mask) == ps->res)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1127) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1128) return page_action(ps, p, pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1129) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1130)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1131) static int try_to_split_thp_page(struct page *page, const char *msg)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1132) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1133) lock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1134) if (!PageAnon(page) || unlikely(split_huge_page(page))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1135) unsigned long pfn = page_to_pfn(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1136)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1137) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1138) if (!PageAnon(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1139) pr_info("%s: %#lx: non anonymous thp\n", msg, pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1140) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1141) pr_info("%s: %#lx: thp split failed\n", msg, pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1142) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1143) return -EBUSY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1144) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1145) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1146)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1147) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1148) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1149)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1150) static int memory_failure_hugetlb(unsigned long pfn, int flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1151) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1152) struct page *p = pfn_to_page(pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1153) struct page *head = compound_head(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1154) int res;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1155) unsigned long page_flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1156)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1157) if (TestSetPageHWPoison(head)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1158) pr_err("Memory failure: %#lx: already hardware poisoned\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1159) pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1160) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1161) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1162)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1163) num_poisoned_pages_inc();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1164)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1165) if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1166) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1167) * Check "filter hit" and "race with other subpage."
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1168) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1169) lock_page(head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1170) if (PageHWPoison(head)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1171) if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1172) || (p != head && TestSetPageHWPoison(head))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1173) num_poisoned_pages_dec();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1174) unlock_page(head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1175) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1176) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1177) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1178) unlock_page(head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1179) dissolve_free_huge_page(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1180) action_result(pfn, MF_MSG_FREE_HUGE, MF_DELAYED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1181) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1182) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1183)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1184) lock_page(head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1185) page_flags = head->flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1186)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1187) if (!PageHWPoison(head)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1188) pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1189) num_poisoned_pages_dec();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1190) unlock_page(head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1191) put_page(head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1192) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1193) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1194)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1195) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1196) * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1197) * simply disable it. In order to make it work properly, we need to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1198) * make sure that:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1199) * - conversion of a pud that maps an error hugetlb into hwpoison
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1200) * entry properly works, and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1201) * - other mm code walking over page tables is aware of pud-aligned
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1202) * hwpoison entries.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1203) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1204) if (huge_page_size(page_hstate(head)) > PMD_SIZE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1205) action_result(pfn, MF_MSG_NON_PMD_HUGE, MF_IGNORED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1206) res = -EBUSY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1207) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1208) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1209)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1210) if (!hwpoison_user_mappings(p, pfn, flags, &head)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1211) action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1212) res = -EBUSY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1213) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1214) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1215)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1216) res = identify_page_state(pfn, p, page_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1217) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1218) unlock_page(head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1219) return res;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1220) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1221)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1222) static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1223) struct dev_pagemap *pgmap)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1224) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1225) struct page *page = pfn_to_page(pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1226) const bool unmap_success = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1227) unsigned long size = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1228) struct to_kill *tk;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1229) LIST_HEAD(tokill);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1230) int rc = -EBUSY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1231) loff_t start;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1232) dax_entry_t cookie;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1233)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1234) if (flags & MF_COUNT_INCREASED)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1235) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1236) * Drop the extra refcount in case we come from madvise().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1237) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1238) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1239)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1240) /* device metadata space is not recoverable */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1241) if (!pgmap_pfn_valid(pgmap, pfn)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1242) rc = -ENXIO;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1243) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1244) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1245)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1246) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1247) * Prevent the inode from being freed while we are interrogating
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1248) * the address_space; typically this would be handled by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1249) * lock_page(), but dax pages do not use the page lock. This
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1250) * also prevents changes to the mapping of this pfn until
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1251) * poison signaling is complete.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1252) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1253) cookie = dax_lock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1254) if (!cookie)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1255) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1256)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1257) if (hwpoison_filter(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1258) rc = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1259) goto unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1260) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1261)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1262) if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1263) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1264) * TODO: Handle HMM pages which may need coordination
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1265) * with device-side memory.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1266) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1267) goto unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1268) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1269)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1270) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1271) * Use this flag as an indication that the dax page has been
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1272) * remapped UC to prevent speculative consumption of poison.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1273) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1274) SetPageHWPoison(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1275)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1276) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1277) * Unlike System-RAM there is no possibility to swap in a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1278) * different physical page at a given virtual address, so all
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1279) * userspace consumption of ZONE_DEVICE memory necessitates
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1280) * SIGBUS (i.e. MF_MUST_KILL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1281) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1282) flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1283) collect_procs(page, &tokill, flags & MF_ACTION_REQUIRED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1284)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1285) list_for_each_entry(tk, &tokill, nd)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1286) if (tk->size_shift)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1287) size = max(size, 1UL << tk->size_shift);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1288) if (size) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1289) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1290) * Unmap the largest mapping to avoid breaking up
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1291) * device-dax mappings which are constant size. The
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1292) * actual size of the mapping being torn down is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1293) * communicated in siginfo, see kill_proc().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1294) */
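/*
 * Worked example with illustrative numbers: for a 2MiB device-dax
 * alignment, size == 0x200000.  A page->index of 0x12345 gives a byte
 * offset of 0x12345000, and masking with ~(size - 1) rounds that down
 * to start == 0x12200000, so the whole 2MiB block containing the
 * poisoned pfn is unmapped in one go.
 */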
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1295) start = (page->index << PAGE_SHIFT) & ~(size - 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1296) unmap_mapping_range(page->mapping, start, size, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1297) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1298) kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1299) rc = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1300) unlock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1301) dax_unlock_page(page, cookie);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1302) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1303) /* drop pgmap ref acquired in caller */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1304) put_dev_pagemap(pgmap);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1305) action_result(pfn, MF_MSG_DAX, rc ? MF_FAILED : MF_RECOVERED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1306) return rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1307) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1308)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1309) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1310) * memory_failure - Handle memory failure of a page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1311) * @pfn: Page Number of the corrupted page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1312) * @flags: fine tune action taken
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1313) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1314) * This function is called by the low level machine check code
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1315) * of an architecture when it detects hardware memory corruption
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1316) * of a page. It tries its best to recover, which includes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1317) * dropping pages, killing processes etc.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1318) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1319) * The function is primarily of use for corruptions that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1320) * happen outside the current execution context (e.g. when
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1321) * detected by a background scrubber)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1322) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1323) * Must run in process context (e.g. a work queue) with interrupts
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1324) * enabled and no spinlocks held.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1325) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1326) int memory_failure(unsigned long pfn, int flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1327) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1328) struct page *p;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1329) struct page *hpage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1330) struct page *orig_head;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1331) struct dev_pagemap *pgmap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1332) int res;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1333) unsigned long page_flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1334)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1335) if (!sysctl_memory_failure_recovery)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1336) panic("Memory failure on page %lx", pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1337)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1338) p = pfn_to_online_page(pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1339) if (!p) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1340) if (pfn_valid(pfn)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1341) pgmap = get_dev_pagemap(pfn, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1342) if (pgmap)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1343) return memory_failure_dev_pagemap(pfn, flags,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1344) pgmap);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1345) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1346) pr_err("Memory failure: %#lx: memory outside kernel control\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1347) pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1348) return -ENXIO;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1349) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1350)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1351) if (PageHuge(p))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1352) return memory_failure_hugetlb(pfn, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1353) if (TestSetPageHWPoison(p)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1354) pr_err("Memory failure: %#lx: already hardware poisoned\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1355) pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1356) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1357) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1358)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1359) orig_head = hpage = compound_head(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1360) num_poisoned_pages_inc();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1361)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1362) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1363) * We neither need to nor can do anything about count=0 pages. Either:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1364) * 1) it's a free page, and therefore in safe hands:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1365) * prep_new_page() will be the gatekeeper.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1366) * 2) it's part of a non-compound high order page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1367) * This implies some kernel user: we cannot stop them from
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1368) * reading/writing the page; let's pray that the page has been
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1369) * used and will be freed some time later.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1370) * In fact it's dangerous to directly bump up the page count from 0,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1371) * as that may make page_ref_freeze()/page_ref_unfreeze() mismatch.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1372) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1373) if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1374) if (is_free_buddy_page(p)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1375) action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1376) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1377) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1378) action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1379) return -EBUSY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1380) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1381) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1382)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1383) if (PageTransHuge(hpage)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1384) if (try_to_split_thp_page(p, "Memory Failure") < 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1385) action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1386) return -EBUSY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1387) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1388) VM_BUG_ON_PAGE(!page_count(p), p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1389) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1390)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1391) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1392) * We ignore non-LRU pages for good reasons.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1393) * - PG_locked is only well defined for LRU pages and a few others
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1394) * - to avoid races with __SetPageLocked()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1395) * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1396) * The check (unnecessarily) ignores LRU pages being isolated and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1397) * walked by the page reclaim code; however, that's not a big loss.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1398) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1399) shake_page(p, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1400) /* shake_page could have turned it free. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1401) if (!PageLRU(p) && is_free_buddy_page(p)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1402) if (flags & MF_COUNT_INCREASED)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1403) action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1404) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1405) action_result(pfn, MF_MSG_BUDDY_2ND, MF_DELAYED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1406) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1407) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1408)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1409) lock_page(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1410)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1411) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1412) * The page could have become part of a different compound page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1413) * while we were taking the lock. If this happened, just bail out.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1414) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1415) if (PageCompound(p) && compound_head(p) != orig_head) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1416) action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1417) res = -EBUSY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1418) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1419) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1420)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1421) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1422) * We use page flags to determine what action should be taken, but
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1423) * the flags can be modified by the error containment action. One
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1424) * example is an mlocked page, where PG_mlocked is cleared by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1425) * page_remove_rmap() in try_to_unmap_one(). So to determine page status
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1426) * correctly, we save a copy of the page flags at this time.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1427) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1428) page_flags = p->flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1429)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1430) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1431) * unpoison always clears PG_hwpoison inside the page lock
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1432) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1433) if (!PageHWPoison(p)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1434) pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1435) num_poisoned_pages_dec();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1436) unlock_page(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1437) put_page(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1438) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1439) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1440) if (hwpoison_filter(p)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1441) if (TestClearPageHWPoison(p))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1442) num_poisoned_pages_dec();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1443) unlock_page(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1444) put_page(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1445) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1446) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1447)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1448) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1449) * __munlock_pagevec may clear a writeback page's LRU flag without
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1450) * the page lock. We need to wait for writeback completion for this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1451) * page, or it may trigger a vfs BUG while evicting the inode.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1452) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1453) if (!PageTransTail(p) && !PageLRU(p) && !PageWriteback(p))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1454) goto identify_page_state;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1455)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1456) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1457) * It's very difficult to mess with pages currently under IO
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1458) * and in many cases impossible, so we just avoid it here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1459) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1460) wait_on_page_writeback(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1461)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1462) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1463) * Now take care of user space mappings.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1464) * Abort on fail: __delete_from_page_cache() assumes unmapped page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1465) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1466) if (!hwpoison_user_mappings(p, pfn, flags, &p)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1467) action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1468) res = -EBUSY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1469) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1470) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1471)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1472) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1473) * Torn down by someone else?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1474) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1475) if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1476) action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1477) res = -EBUSY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1478) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1479) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1480)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1481) identify_page_state:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1482) res = identify_page_state(pfn, p, page_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1483) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1484) unlock_page(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1485) return res;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1486) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1487) EXPORT_SYMBOL_GPL(memory_failure);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1488)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1489) #define MEMORY_FAILURE_FIFO_ORDER 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1490) #define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1491)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1492) struct memory_failure_entry {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1493) unsigned long pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1494) int flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1495) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1496)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1497) struct memory_failure_cpu {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1498) DECLARE_KFIFO(fifo, struct memory_failure_entry,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1499) MEMORY_FAILURE_FIFO_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1500) spinlock_t lock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1501) struct work_struct work;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1502) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1503)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1504) static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1505)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1506) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1507) * memory_failure_queue - Schedule handling memory failure of a page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1508) * @pfn: Page Number of the corrupted page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1509) * @flags: Flags for memory failure handling
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1510) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1511) * This function is called by the low level hardware error handler
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1512) * when it detects hardware memory corruption of a page. It schedules
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1513) * the recovery of the error page, including dropping pages, killing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1514) * processes etc.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1515) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1516) * The function is primarily of use for corruptions that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1517) * happen outside the current execution context (e.g. when
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1518) * detected by a background scrubber)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1519) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1520) * Can run in IRQ context.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1521) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1522) void memory_failure_queue(unsigned long pfn, int flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1523) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1524) struct memory_failure_cpu *mf_cpu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1525) unsigned long proc_flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1526) struct memory_failure_entry entry = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1527) .pfn = pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1528) .flags = flags,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1529) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1530)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1531) mf_cpu = &get_cpu_var(memory_failure_cpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1532) spin_lock_irqsave(&mf_cpu->lock, proc_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1533) if (kfifo_put(&mf_cpu->fifo, entry))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1534) schedule_work_on(smp_processor_id(), &mf_cpu->work);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1535) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1536) pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1537) pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1538) spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1539) put_cpu_var(memory_failure_cpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1540) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1541) EXPORT_SYMBOL_GPL(memory_failure_queue);
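/*
 * Illustrative caller sketch (hypothetical driver code, not from this
 * file): an interrupt handler that has decoded a failing physical
 * address hands the pfn to the work queue above rather than calling
 * memory_failure() directly, since the latter must not run in IRQ
 * context.
 *
 *	static irqreturn_t my_error_irq(int irq, void *data)
 *	{
 *		u64 phys = my_read_error_address(data);	// hypothetical helper
 *
 *		memory_failure_queue(PHYS_PFN(phys), 0);
 *		return IRQ_HANDLED;
 *	}
 */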
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1542)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1543) static void memory_failure_work_func(struct work_struct *work)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1544) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1545) struct memory_failure_cpu *mf_cpu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1546) struct memory_failure_entry entry = { 0, };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1547) unsigned long proc_flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1548) int gotten;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1549)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1550) mf_cpu = container_of(work, struct memory_failure_cpu, work);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1551) for (;;) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1552) spin_lock_irqsave(&mf_cpu->lock, proc_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1553) gotten = kfifo_get(&mf_cpu->fifo, &entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1554) spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1555) if (!gotten)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1556) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1557) if (entry.flags & MF_SOFT_OFFLINE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1558) soft_offline_page(entry.pfn, entry.flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1559) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1560) memory_failure(entry.pfn, entry.flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1561) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1562) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1563)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1564) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1565) * Process memory_failure work queued on the specified CPU.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1566) * Used to avoid return-to-userspace racing with the memory_failure workqueue.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1567) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1568) void memory_failure_queue_kick(int cpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1569) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1570) struct memory_failure_cpu *mf_cpu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1571)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1572) mf_cpu = &per_cpu(memory_failure_cpu, cpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1573) cancel_work_sync(&mf_cpu->work);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1574) memory_failure_work_func(&mf_cpu->work);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1575) }
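#if 0
/*
 * Illustrative sketch only, not compiled: memory_failure_queue_kick() is
 * meant to be called by architecture code, in process context, so that a
 * task does not return to user space while memory_failure work queued on
 * its CPU is still pending. The wrapper below is an assumption for this
 * example and presumes the caller is pinned to the CPU (or has preemption
 * disabled) so that smp_processor_id() names the right queue.
 */
static void example_flush_pending_memory_failures(void)
{
	memory_failure_queue_kick(smp_processor_id());
}
#endif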
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1576)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1577) static int __init memory_failure_init(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1578) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1579) struct memory_failure_cpu *mf_cpu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1580) int cpu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1581)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1582) for_each_possible_cpu(cpu) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1583) mf_cpu = &per_cpu(memory_failure_cpu, cpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1584) spin_lock_init(&mf_cpu->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1585) INIT_KFIFO(mf_cpu->fifo);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1586) INIT_WORK(&mf_cpu->work, memory_failure_work_func);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1587) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1588)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1589) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1590) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1591) core_initcall(memory_failure_init);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1592)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1593) #define unpoison_pr_info(fmt, pfn, rs) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1594) ({ \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1595) if (__ratelimit(rs)) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1596) pr_info(fmt, pfn); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1597) })
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1598)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1599) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1600) * unpoison_memory - Unpoison a previously poisoned page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1601) * @pfn: Page number of the page to be unpoisoned
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1602) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1603) * Software-unpoison a page that has been poisoned by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1604) * memory_failure() earlier.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1605) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1606) * This is done only at the software level, so it only works
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1607) * for Linux-injected failures, not for real hardware failures.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1608) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1609) * Returns 0 for success, otherwise -errno.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1610) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1611) int unpoison_memory(unsigned long pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1612) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1613) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1614) struct page *p;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1615) int freeit = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1616) static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1617) DEFAULT_RATELIMIT_BURST);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1618)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1619) if (!pfn_valid(pfn))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1620) return -ENXIO;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1621)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1622) p = pfn_to_page(pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1623) page = compound_head(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1624)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1625) if (!PageHWPoison(p)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1626) unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1627) pfn, &unpoison_rs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1628) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1629) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1630)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1631) if (page_count(page) > 1) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1632) unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1633) pfn, &unpoison_rs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1634) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1635) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1636)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1637) if (page_mapped(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1638) unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1639) pfn, &unpoison_rs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1640) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1641) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1642)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1643) if (page_mapping(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1644) unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1645) pfn, &unpoison_rs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1646) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1647) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1648)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1649) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1650) * unpoison_memory() can encounter a thp only when the thp is being
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1651) * handled by memory_failure() and the page lock is not yet held.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1652) * In that case, we yield to memory_failure() and make unpoison fail.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1653) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1654) if (!PageHuge(page) && PageTransHuge(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1655) unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1656) pfn, &unpoison_rs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1657) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1658) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1659)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1660) if (!get_hwpoison_page(p)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1661) if (TestClearPageHWPoison(p))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1662) num_poisoned_pages_dec();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1663) unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1664) pfn, &unpoison_rs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1665) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1666) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1667)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1668) lock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1669) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1670) * This test is racy because PG_hwpoison is set outside of the page lock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1671) * That's acceptable because it won't trigger a kernel panic. Instead,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1672) * the PG_hwpoison page will be caught and isolated when it enters
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1673) * the free buddy page pool.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1674) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1675) if (TestClearPageHWPoison(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1676) unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1677) pfn, &unpoison_rs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1678) num_poisoned_pages_dec();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1679) freeit = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1680) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1681) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1682)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1683) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1684) if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1685) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1686)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1687) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1688) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1689) EXPORT_SYMBOL(unpoison_memory);
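#if 0
/*
 * Illustrative sketch only, not compiled: unpoison_memory() exists for
 * test tooling (in the spirit of the hwpoison-inject debugfs interface)
 * that wants to undo a software-injected poisoning. The wrapper below is
 * an assumption written for this example; only unpoison_memory() is taken
 * from the code above.
 */
static int example_unpoison_pfn(u64 pfn)
{
	/* Only pages poisoned by injection can be recovered this way. */
	return unpoison_memory((unsigned long)pfn);
}
#endif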
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1690)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1691) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1692) * Safely get a reference to an arbitrary page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1693) * Returns 0 for a free page, -EBUSY if the page raced with an allocation,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1694) * -EIO for a zero-refcount page of unknown type, and 1 for any other
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1695) * page type. Only for 1 is the page returned with its refcount increased.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1696) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1697) static int __get_any_page(struct page *p, unsigned long pfn, int flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1698) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1699) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1700)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1701) if (flags & MF_COUNT_INCREASED)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1702) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1703)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1704) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1705) * When the target page is a free hugepage, just remove it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1706) * from free hugepage list.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1707) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1708) if (!get_hwpoison_page(p)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1709) if (PageHuge(p)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1710) pr_info("%s: %#lx free huge page\n", __func__, pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1711) ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1712) } else if (is_free_buddy_page(p)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1713) pr_info("%s: %#lx free buddy page\n", __func__, pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1714) ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1715) } else if (page_count(p)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1716) /* raced with allocation */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1717) ret = -EBUSY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1718) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1719) pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1720) __func__, pfn, p->flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1721) ret = -EIO;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1722) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1723) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1724) /* Not a free page */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1725) ret = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1726) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1727) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1728) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1729)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1730) static int get_any_page(struct page *page, unsigned long pfn, int flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1731) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1732) int ret = __get_any_page(page, pfn, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1733)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1734) if (ret == -EBUSY)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1735) ret = __get_any_page(page, pfn, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1736)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1737) if (ret == 1 && !PageHuge(page) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1738) !PageLRU(page) && !__PageMovable(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1739) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1740) * Try to free it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1741) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1742) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1743) shake_page(page, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1744)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1745) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1746) * Did it turn free?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1747) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1748) ret = __get_any_page(page, pfn, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1749) if (ret == 1 && !PageLRU(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1750) /* Drop the page reference taken by __get_any_page() */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1751) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1752) pr_info("soft_offline: %#lx: unknown non LRU page type %lx (%pGp)\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1753) pfn, page->flags, &page->flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1754) return -EIO;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1755) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1756) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1757) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1758) }
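#if 0
/*
 * Illustrative sketch only, not compiled: how a caller is expected to
 * interpret get_any_page()'s return value. The function below is invented
 * for this example; the real dispatch lives in soft_offline_page() further
 * down in this file.
 */
static int example_interpret_get_any_page(struct page *page, unsigned long pfn)
{
	int ret = get_any_page(page, pfn, 0);

	if (ret > 0) {
		/*
		 * The page is in use and a reference was taken; the caller
		 * must eventually drop it (isolate_page() below consumes it
		 * on the soft-offline path).
		 */
		put_page(page);
		return 0;
	}
	/*
	 * ret == 0: the page was free, no reference was taken.
	 * ret < 0:  -EIO or -EBUSY, nothing to release.
	 */
	return ret;
}
#endif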
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1759)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1760) static bool isolate_page(struct page *page, struct list_head *pagelist)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1761) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1762) bool isolated = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1763) bool lru = PageLRU(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1764)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1765) if (PageHuge(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1766) isolated = isolate_huge_page(page, pagelist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1767) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1768) if (lru)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1769) isolated = !isolate_lru_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1770) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1771) isolated = !isolate_movable_page(page, ISOLATE_UNEVICTABLE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1772)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1773) if (isolated)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1774) list_add(&page->lru, pagelist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1775) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1776)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1777) if (isolated && lru)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1778) inc_node_page_state(page, NR_ISOLATED_ANON +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1779) page_is_file_lru(page));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1780)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1781) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1782) * If we succeed in isolating the page, we grabbed another refcount on
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1783) * the page, so we can safely drop the one we got from get_any_page().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1784) * If we failed to isolate the page, we cannot go any further and will
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1785) * return an error, so drop the reference we got from get_any_page()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1786) * as well.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1787) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1788) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1789) return isolated;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1790) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1791)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1792) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1793) * __soft_offline_page handles hugetlb pages and non-hugetlb pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1794) * If the page is a clean, unmapped page-cache page, it is simply invalidated.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1795) * Otherwise, the contents are migrated to a new page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1796) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1797) static int __soft_offline_page(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1798) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1799) int ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1800) unsigned long pfn = page_to_pfn(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1801) struct page *hpage = compound_head(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1802) char const *msg_page[] = {"page", "hugepage"};
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1803) bool huge = PageHuge(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1804) LIST_HEAD(pagelist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1805) struct migration_target_control mtc = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1806) .nid = NUMA_NO_NODE,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1807) .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1808) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1809)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1810) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1811) * Check PageHWPoison again inside page lock because PageHWPoison
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1812) * is set by memory_failure() outside page lock. Note that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1813) * memory_failure() also double-checks PageHWPoison inside page lock,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1814) * so there's no race between soft_offline_page() and memory_failure().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1815) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1816) lock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1817) if (!PageHuge(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1818) wait_on_page_writeback(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1819) if (PageHWPoison(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1820) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1821) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1822) pr_info("soft offline: %#lx page already poisoned\n", pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1823) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1824) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1825)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1826) if (!PageHuge(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1827) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1828) * Try to invalidate first. This should work for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1829) * non dirty unmapped page cache pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1830) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1831) ret = invalidate_inode_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1832) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1833)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1834) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1835) * RED-PEN: it would be better to keep the page isolated here, but we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1836) * would need to fix the isolation locking first.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1837) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1838) if (ret) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1839) pr_info("soft_offline: %#lx: invalidated\n", pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1840) page_handle_poison(page, false, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1841) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1842) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1843)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1844) if (isolate_page(hpage, &pagelist)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1845) ret = migrate_pages(&pagelist, alloc_migration_target, NULL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1846) (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1847) if (!ret) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1848) bool release = !huge;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1849)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1850) if (!page_handle_poison(page, huge, release))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1851) ret = -EBUSY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1852) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1853) if (!list_empty(&pagelist))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1854) putback_movable_pages(&pagelist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1855)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1856) pr_info("soft offline: %#lx: %s migration failed %d, type %lx (%pGp)\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1857) pfn, msg_page[huge], ret, page->flags, &page->flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1858) if (ret > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1859) ret = -EBUSY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1860) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1861) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1862) pr_info("soft offline: %#lx: %s isolation failed, page count %d, type %lx (%pGp)\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1863) pfn, msg_page[huge], page_count(page), page->flags, &page->flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1864) ret = -EBUSY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1865) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1866) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1867) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1868)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1869) static int soft_offline_in_use_page(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1870) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1871) struct page *hpage = compound_head(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1872)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1873) if (!PageHuge(page) && PageTransHuge(hpage))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1874) if (try_to_split_thp_page(page, "soft offline") < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1875) return -EBUSY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1876) return __soft_offline_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1877) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1878)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1879) static int soft_offline_free_page(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1880) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1881) int rc = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1882)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1883) if (!page_handle_poison(page, true, false))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1884) rc = -EBUSY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1885)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1886) return rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1887) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1888)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1889) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1890) * soft_offline_page - Soft offline a page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1891) * @pfn: pfn to soft-offline
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1892) * @flags: flags. Same as memory_failure().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1893) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1894) * Returns 0 on success, otherwise negated errno.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1895) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1896) * Soft offline a page, by migration or invalidation,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1897) * without killing anything. This is for the case when
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1898) * a page is not corrupted yet (so it's still valid to access),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1899) * but has had a number of corrected errors and is better taken
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1900) * out.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1901) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1902) * The actual policy on when to do that is maintained by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1903) * user space.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1904) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1905) * This should never impact any application or cause data loss,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1906) * however it might take some time.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1907) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1908) * This is not a 100% solution for all memory, but tries to be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1909) * ``good enough'' for the majority of memory.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1910) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1911) int soft_offline_page(unsigned long pfn, int flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1912) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1913) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1914) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1915) bool try_again = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1916)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1917) if (!pfn_valid(pfn))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1918) return -ENXIO;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1919) /* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1920) page = pfn_to_online_page(pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1921) if (!page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1922) return -EIO;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1923)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1924) if (PageHWPoison(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1925) pr_info("soft offline: %#lx page already poisoned\n", pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1926) if (flags & MF_COUNT_INCREASED)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1927) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1928) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1929) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1930)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1931) retry:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1932) get_online_mems();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1933) ret = get_any_page(page, pfn, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1934) put_online_mems();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1935)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1936) if (ret > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1937) ret = soft_offline_in_use_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1938) else if (ret == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1939) if (soft_offline_free_page(page) && try_again) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1940) try_again = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1941) flags &= ~MF_COUNT_INCREASED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1942) goto retry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1943) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1944)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1945) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1946) }
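#if 0
/*
 * Illustrative sketch only, not compiled: soft_offline_page() is driven
 * from user space, for example via the memory sysfs soft_offline_page
 * attribute or a corrected-error policy daemon. The store-style handler
 * below is an assumption written for this example, not the actual sysfs
 * code; only soft_offline_page() and MF_COUNT_INCREASED are real.
 */
static ssize_t example_soft_offline_store(const char *buf, size_t count)
{
	u64 pfn;
	int ret;

	if (kstrtou64(buf, 0, &pfn))
		return -EINVAL;
	/* No extra page reference is held, so MF_COUNT_INCREASED is not set. */
	ret = soft_offline_page((unsigned long)pfn, 0);
	return ret ? ret : count;
}
#endif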