// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/dax.c - Direct Access filesystem code
 * Copyright (c) 2013-2014 Intel Corporation
 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
 */

#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/pagevec.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/uio.h>
#include <linux/vmstat.h>
#include <linux/pfn_t.h>
#include <linux/sizes.h>
#include <linux/mmu_notifier.h>
#include <linux/iomap.h>
#include <asm/pgalloc.h>

#define CREATE_TRACE_POINTS
#include <trace/events/fs_dax.h>

static inline unsigned int pe_order(enum page_entry_size pe_size)
{
	if (pe_size == PE_SIZE_PTE)
		return PAGE_SHIFT - PAGE_SHIFT;
	if (pe_size == PE_SIZE_PMD)
		return PMD_SHIFT - PAGE_SHIFT;
	if (pe_size == PE_SIZE_PUD)
		return PUD_SHIFT - PAGE_SHIFT;
	return ~0;
}

/* We choose 4096 entries - same as per-zone page wait tables */
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)

/* The 'colour' (i.e. low bits) within a PMD of a page offset. */
#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
#define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT)

/* The order of a PMD entry */
#define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT)

static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];

static int __init init_dax_wait_table(void)
{
	int i;

	for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
		init_waitqueue_head(wait_table + i);
	return 0;
}
fs_initcall(init_dax_wait_table);

/*
 * DAX pagecache entries use XArray value entries so they can't be mistaken
 * for pages. We use one bit for locking, one bit for the entry size (PMD)
 * and two more to tell us if the entry is a zero page or an empty entry that
 * is just used for locking. In total four special bits.
 *
 * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
 * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
 * block allocation.
 */
#define DAX_SHIFT	(4)
#define DAX_LOCKED	(1UL << 0)
#define DAX_PMD		(1UL << 1)
#define DAX_ZERO_PAGE	(1UL << 2)
#define DAX_EMPTY	(1UL << 3)

static unsigned long dax_to_pfn(void *entry)
{
	return xa_to_value(entry) >> DAX_SHIFT;
}

static void *dax_make_entry(pfn_t pfn, unsigned long flags)
{
	return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
}
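
/*
 * Worked example (hypothetical pfn value): dax_make_entry(pfn_to_pfn_t(0x1234),
 * DAX_PMD) encodes the XArray value (0x1234 << DAX_SHIFT) | DAX_PMD, and
 * dax_to_pfn() recovers 0x1234 by shifting the flag bits back out.  On x86-64
 * with 4K pages, PMD_ORDER above evaluates to 9 (512 PTEs per PMD).
 */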

static bool dax_is_locked(void *entry)
{
	return xa_to_value(entry) & DAX_LOCKED;
}

static unsigned int dax_entry_order(void *entry)
{
	if (xa_to_value(entry) & DAX_PMD)
		return PMD_ORDER;
	return 0;
}

static unsigned long dax_is_pmd_entry(void *entry)
{
	return xa_to_value(entry) & DAX_PMD;
}

static bool dax_is_pte_entry(void *entry)
{
	return !(xa_to_value(entry) & DAX_PMD);
}

static int dax_is_zero_entry(void *entry)
{
	return xa_to_value(entry) & DAX_ZERO_PAGE;
}

static int dax_is_empty_entry(void *entry)
{
	return xa_to_value(entry) & DAX_EMPTY;
}

/*
 * true if the entry that was found is of a smaller order than the entry
 * we were looking for
 */
static bool dax_is_conflict(void *entry)
{
	return entry == XA_RETRY_ENTRY;
}

/*
 * DAX page cache entry locking
 */
struct exceptional_entry_key {
	struct xarray *xa;
	pgoff_t entry_start;
};

struct wait_exceptional_entry_queue {
	wait_queue_entry_t wait;
	struct exceptional_entry_key key;
};

/**
 * enum dax_wake_mode: waitqueue wakeup behaviour
 * @WAKE_ALL: wake all waiters in the waitqueue
 * @WAKE_NEXT: wake only the first waiter in the waitqueue
 */
enum dax_wake_mode {
	WAKE_ALL,
	WAKE_NEXT,
};

static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas,
		void *entry, struct exceptional_entry_key *key)
{
	unsigned long hash;
	unsigned long index = xas->xa_index;

	/*
	 * If 'entry' is a PMD, align the 'index' that we use for the wait
	 * queue to the start of that PMD.  This ensures that all offsets in
	 * the range covered by the PMD map to the same bit lock.
	 */
	if (dax_is_pmd_entry(entry))
		index &= ~PG_PMD_COLOUR;
	key->xa = xas->xa;
	key->entry_start = index;

	hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS);
	return wait_table + hash;
}
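
/*
 * Illustration: on a configuration with 512 pages per PMD, a waiter on any
 * page offset in 0x200-0x3ff of a PMD entry has its index masked down to
 * 0x200 above, so all of those waiters hash to the same wait_table bucket
 * and match the same wakeup key.
 */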

static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
		unsigned int mode, int sync, void *keyp)
{
	struct exceptional_entry_key *key = keyp;
	struct wait_exceptional_entry_queue *ewait =
		container_of(wait, struct wait_exceptional_entry_queue, wait);

	if (key->xa != ewait->key.xa ||
	    key->entry_start != ewait->key.entry_start)
		return 0;
	return autoremove_wake_function(wait, mode, sync, NULL);
}

/*
 * @entry may no longer be the entry at the index in the mapping.
 * The important information it's conveying is whether the entry at
 * this index used to be a PMD entry.
 */
static void dax_wake_entry(struct xa_state *xas, void *entry,
			   enum dax_wake_mode mode)
{
	struct exceptional_entry_key key;
	wait_queue_head_t *wq;

	wq = dax_entry_waitqueue(xas, entry, &key);

	/*
	 * Checking for locked entry and prepare_to_wait_exclusive() happens
	 * under the i_pages lock, ditto for entry handling in our callers.
	 * So at this point all tasks that could have seen our entry locked
	 * must be in the waitqueue and the following check will see them.
	 */
	if (waitqueue_active(wq))
		__wake_up(wq, TASK_NORMAL, mode == WAKE_ALL ? 0 : 1, &key);
}

/*
 * Look up entry in page cache, wait for it to become unlocked if it
 * is a DAX entry and return it.  The caller must subsequently call
 * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry()
 * if it did.  The entry returned may have a larger order than @order.
 * If @order is larger than the order of the entry found in i_pages, this
 * function returns a dax_is_conflict entry.
 *
 * Must be called with the i_pages lock held.
 */
static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
{
	void *entry;
	struct wait_exceptional_entry_queue ewait;
	wait_queue_head_t *wq;

	init_wait(&ewait.wait);
	ewait.wait.func = wake_exceptional_entry_func;

	for (;;) {
		entry = xas_find_conflict(xas);
		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
			return entry;
		if (dax_entry_order(entry) < order)
			return XA_RETRY_ENTRY;
		if (!dax_is_locked(entry))
			return entry;

		wq = dax_entry_waitqueue(xas, entry, &ewait.key);
		prepare_to_wait_exclusive(wq, &ewait.wait,
					  TASK_UNINTERRUPTIBLE);
		xas_unlock_irq(xas);
		xas_reset(xas);
		schedule();
		finish_wait(wq, &ewait.wait);
		xas_lock_irq(xas);
	}
}
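
/*
 * Sketch of the calling convention described above (see also
 * __dax_invalidate_entry() below):
 *
 *	xas_lock_irq(&xas);
 *	entry = get_unlocked_entry(&xas, 0);
 *	... inspect the (unlocked) entry ...
 *	put_unlocked_entry(&xas, entry, WAKE_NEXT);	(did not lock it)
 *	xas_unlock_irq(&xas);
 *
 * A caller that instead locks the entry with dax_lock_entry() must release
 * it with dax_unlock_entry() rather than put_unlocked_entry().
 */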

/*
 * The only thing keeping the address space around is the i_pages lock
 * (it's cycled in clear_inode() after removing the entries from i_pages).
 * After we call xas_unlock_irq(), we cannot touch xas->xa.
 */
static void wait_entry_unlocked(struct xa_state *xas, void *entry)
{
	struct wait_exceptional_entry_queue ewait;
	wait_queue_head_t *wq;

	init_wait(&ewait.wait);
	ewait.wait.func = wake_exceptional_entry_func;

	wq = dax_entry_waitqueue(xas, entry, &ewait.key);
	/*
	 * Unlike get_unlocked_entry() there is no guarantee that this
	 * path ever successfully retrieves an unlocked entry before an
	 * inode dies. Perform a non-exclusive wait in case this path
	 * never successfully performs its own wake up.
	 */
	prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE);
	xas_unlock_irq(xas);
	schedule();
	finish_wait(wq, &ewait.wait);
}

static void put_unlocked_entry(struct xa_state *xas, void *entry,
			       enum dax_wake_mode mode)
{
	if (entry && !dax_is_conflict(entry))
		dax_wake_entry(xas, entry, mode);
}

/*
 * We used the xa_state to get the entry, but then we locked the entry and
 * dropped the xa_lock, so we know the xa_state is stale and must be reset
 * before use.
 */
static void dax_unlock_entry(struct xa_state *xas, void *entry)
{
	void *old;

	BUG_ON(dax_is_locked(entry));
	xas_reset(xas);
	xas_lock_irq(xas);
	old = xas_store(xas, entry);
	xas_unlock_irq(xas);
	BUG_ON(!dax_is_locked(old));
	dax_wake_entry(xas, entry, WAKE_NEXT);
}

/*
 * Return: The entry stored at this location before it was locked.
 */
static void *dax_lock_entry(struct xa_state *xas, void *entry)
{
	unsigned long v = xa_to_value(entry);
	return xas_store(xas, xa_mk_value(v | DAX_LOCKED));
}

static unsigned long dax_entry_size(void *entry)
{
	if (dax_is_zero_entry(entry))
		return 0;
	else if (dax_is_empty_entry(entry))
		return 0;
	else if (dax_is_pmd_entry(entry))
		return PMD_SIZE;
	else
		return PAGE_SIZE;
}

static unsigned long dax_end_pfn(void *entry)
{
	return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
}

/*
 * Iterate through all mapped pfns represented by an entry, i.e. skip
 * 'empty' and 'zero' entries.
 */
#define for_each_mapped_pfn(entry, pfn) \
	for (pfn = dax_to_pfn(entry); \
			pfn < dax_end_pfn(entry); pfn++)
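
/*
 * Example: a PMD-sized entry makes the loop above visit dax_to_pfn(entry)
 * through dax_to_pfn(entry) + PG_PMD_NR - 1, while zero-page and empty
 * entries have dax_entry_size() == 0 so the loop body never runs.
 */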

/*
 * TODO: for reflink+dax we need a way to associate a single page with
 * multiple address_space instances at different linear_page_index()
 * offsets.
 */
static void dax_associate_entry(void *entry, struct address_space *mapping,
		struct vm_area_struct *vma, unsigned long address)
{
	unsigned long size = dax_entry_size(entry), pfn, index;
	int i = 0;

	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
		return;

	index = linear_page_index(vma, address & ~(size - 1));
	for_each_mapped_pfn(entry, pfn) {
		struct page *page = pfn_to_page(pfn);

		WARN_ON_ONCE(page->mapping);
		page->mapping = mapping;
		page->index = index + i++;
	}
}

static void dax_disassociate_entry(void *entry, struct address_space *mapping,
		bool trunc)
{
	unsigned long pfn;

	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
		return;

	for_each_mapped_pfn(entry, pfn) {
		struct page *page = pfn_to_page(pfn);

		WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
		WARN_ON_ONCE(page->mapping && page->mapping != mapping);
		page->mapping = NULL;
		page->index = 0;
	}
}

static struct page *dax_busy_page(void *entry)
{
	unsigned long pfn;

	for_each_mapped_pfn(entry, pfn) {
		struct page *page = pfn_to_page(pfn);

		if (page_ref_count(page) > 1)
			return page;
	}
	return NULL;
}

/*
 * dax_lock_page - Lock the DAX entry corresponding to a page
 * @page: The page whose entry we want to lock
 *
 * Context: Process context.
 * Return: A cookie to pass to dax_unlock_page() or 0 if the entry could
 * not be locked.
 */
dax_entry_t dax_lock_page(struct page *page)
{
	XA_STATE(xas, NULL, 0);
	void *entry;

	/* Ensure page->mapping isn't freed while we look at it */
	rcu_read_lock();
	for (;;) {
		struct address_space *mapping = READ_ONCE(page->mapping);

		entry = NULL;
		if (!mapping || !dax_mapping(mapping))
			break;

		/*
		 * In the device-dax case there's no need to lock, a
		 * struct dev_pagemap pin is sufficient to keep the
		 * inode alive, and we assume we have dev_pagemap pin
		 * otherwise we would not have a valid pfn_to_page()
		 * translation.
		 */
		entry = (void *)~0UL;
		if (S_ISCHR(mapping->host->i_mode))
			break;

		xas.xa = &mapping->i_pages;
		xas_lock_irq(&xas);
		if (mapping != page->mapping) {
			xas_unlock_irq(&xas);
			continue;
		}
		xas_set(&xas, page->index);
		entry = xas_load(&xas);
		if (dax_is_locked(entry)) {
			rcu_read_unlock();
			wait_entry_unlocked(&xas, entry);
			rcu_read_lock();
			continue;
		}
		dax_lock_entry(&xas, entry);
		xas_unlock_irq(&xas);
		break;
	}
	rcu_read_unlock();
	return (dax_entry_t)entry;
}

void dax_unlock_page(struct page *page, dax_entry_t cookie)
{
	struct address_space *mapping = page->mapping;
	XA_STATE(xas, &mapping->i_pages, page->index);

	if (S_ISCHR(mapping->host->i_mode))
		return;

	dax_unlock_entry(&xas, (void *)cookie);
}
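
/*
 * Usage sketch (the callers live outside this file):
 *
 *	dax_entry_t cookie = dax_lock_page(page);
 *	if (!cookie)
 *		return;		(the entry could not be locked)
 *	... operate on the page while its entry is held locked ...
 *	dax_unlock_page(page, cookie);
 */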

/*
 * Find page cache entry at given index. If it is a DAX entry, return it
 * with the entry locked. If the page cache doesn't contain an entry at
 * that index, add a locked empty entry.
 *
 * When requesting an entry with size DAX_PMD, grab_mapping_entry() will
 * either return that locked entry or will return VM_FAULT_FALLBACK.
 * This will happen if there are any PTE entries within the PMD range
 * that we are requesting.
 *
 * We always favor PTE entries over PMD entries. There isn't a flow where we
 * evict PTE entries in order to 'upgrade' them to a PMD entry.  A PMD
 * insertion will fail if it finds any PTE entries already in the tree, and a
 * PTE insertion will cause an existing PMD entry to be unmapped and
 * downgraded to PTE entries.  This happens for both PMD zero pages as
 * well as PMD empty entries.
 *
 * The exception to this downgrade path is for PMD entries that have
 * real storage backing them.  We will leave these real PMD entries in
 * the tree, and PTE writes will simply dirty the entire PMD entry.
 *
 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
 * persistent memory the benefit is doubtful. We can add that later if we can
 * show it helps.
 *
 * On error, this function does not return an ERR_PTR.  Instead it returns
 * a VM_FAULT code, encoded as an xarray internal entry.  The ERR_PTR values
 * overlap with xarray value entries.
 */
static void *grab_mapping_entry(struct xa_state *xas,
		struct address_space *mapping, unsigned int order)
{
	unsigned long index = xas->xa_index;
	bool pmd_downgrade;	/* splitting PMD entry into PTE entries? */
	void *entry;

retry:
	pmd_downgrade = false;
	xas_lock_irq(xas);
	entry = get_unlocked_entry(xas, order);

	if (entry) {
		if (dax_is_conflict(entry))
			goto fallback;
		if (!xa_is_value(entry)) {
			xas_set_err(xas, -EIO);
			goto out_unlock;
		}

		if (order == 0) {
			if (dax_is_pmd_entry(entry) &&
			    (dax_is_zero_entry(entry) ||
			     dax_is_empty_entry(entry))) {
				pmd_downgrade = true;
			}
		}
	}

	if (pmd_downgrade) {
		/*
		 * Make sure 'entry' remains valid while we drop
		 * the i_pages lock.
		 */
		dax_lock_entry(xas, entry);

		/*
		 * Besides huge zero pages the only other thing that gets
		 * downgraded are empty entries which don't need to be
		 * unmapped.
		 */
		if (dax_is_zero_entry(entry)) {
			xas_unlock_irq(xas);
			unmap_mapping_pages(mapping,
					xas->xa_index & ~PG_PMD_COLOUR,
					PG_PMD_NR, false);
			xas_reset(xas);
			xas_lock_irq(xas);
		}

		dax_disassociate_entry(entry, mapping, false);
		xas_store(xas, NULL);	/* undo the PMD join */
		dax_wake_entry(xas, entry, WAKE_ALL);
		mapping->nrexceptional--;
		entry = NULL;
		xas_set(xas, index);
	}

	if (entry) {
		dax_lock_entry(xas, entry);
	} else {
		unsigned long flags = DAX_EMPTY;

		if (order > 0)
			flags |= DAX_PMD;
		entry = dax_make_entry(pfn_to_pfn_t(0), flags);
		dax_lock_entry(xas, entry);
		if (xas_error(xas))
			goto out_unlock;
		mapping->nrexceptional++;
	}

out_unlock:
	xas_unlock_irq(xas);
	if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM))
		goto retry;
	if (xas->xa_node == XA_ERROR(-ENOMEM))
		return xa_mk_internal(VM_FAULT_OOM);
	if (xas_error(xas))
		return xa_mk_internal(VM_FAULT_SIGBUS);
	return entry;
fallback:
	xas_unlock_irq(xas);
	return xa_mk_internal(VM_FAULT_FALLBACK);
}
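
/*
 * Sketch of how a fault handler decodes the result (the fault paths are
 * further down in fs/dax.c, outside this excerpt):
 *
 *	entry = grab_mapping_entry(&xas, mapping, 0);
 *	if (xa_is_internal(entry))
 *		return xa_to_internal(entry);	(a VM_FAULT_* code)
 *	... use the locked entry, then dax_unlock_entry(&xas, entry) ...
 */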

/**
 * dax_layout_busy_page_range - find first pinned page in @mapping
 * @mapping: address space to scan for a page with ref count > 1
 * @start: Starting offset. Page containing 'start' is included.
 * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX,
 *       pages from 'start' till the end of file are included.
 *
 * DAX requires ZONE_DEVICE mapped pages. These pages are never
 * 'onlined' to the page allocator so they are considered idle when
 * page->count == 1. A filesystem uses this interface to determine if
 * any page in the mapping is busy, i.e. for DMA, or other
 * get_user_pages() usages.
 *
 * It is expected that the filesystem is holding locks to block the
 * establishment of new mappings in this address_space. I.e. it expects
 * to be able to run unmap_mapping_range() and subsequently not race
 * mapping_mapped() becoming true.
 */
struct page *dax_layout_busy_page_range(struct address_space *mapping,
					loff_t start, loff_t end)
{
	void *entry;
	unsigned int scanned = 0;
	struct page *page = NULL;
	pgoff_t start_idx = start >> PAGE_SHIFT;
	pgoff_t end_idx;
	XA_STATE(xas, &mapping->i_pages, start_idx);

	/*
	 * In the 'limited' case get_user_pages() for dax is disabled.
	 */
	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
		return NULL;

	if (!dax_mapping(mapping) || !mapping_mapped(mapping))
		return NULL;

	/* If end == LLONG_MAX, all pages from start till the end of the file */
	if (end == LLONG_MAX)
		end_idx = ULONG_MAX;
	else
		end_idx = end >> PAGE_SHIFT;
	/*
	 * If we race get_user_pages_fast() here either we'll see the
	 * elevated page count in the iteration and wait, or
	 * get_user_pages_fast() will see that the page it took a reference
	 * against is no longer mapped in the page tables and bail to the
	 * get_user_pages() slow path.  The slow path is protected by
	 * pte_lock() and pmd_lock(). New references are not taken without
	 * holding those locks, and unmap_mapping_pages() will not zero the
	 * pte or pmd without holding the respective lock, so we are
	 * guaranteed to either see new references or prevent new
	 * references from being established.
	 */
	unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0);

	xas_lock_irq(&xas);
	xas_for_each(&xas, entry, end_idx) {
		if (WARN_ON_ONCE(!xa_is_value(entry)))
			continue;
		if (unlikely(dax_is_locked(entry)))
			entry = get_unlocked_entry(&xas, 0);
		if (entry)
			page = dax_busy_page(entry);
		put_unlocked_entry(&xas, entry, WAKE_NEXT);
		if (page)
			break;
		if (++scanned % XA_CHECK_SCHED)
			continue;

		xas_pause(&xas);
		xas_unlock_irq(&xas);
		cond_resched();
		xas_lock_irq(&xas);
	}
	xas_unlock_irq(&xas);
	return page;
}
EXPORT_SYMBOL_GPL(dax_layout_busy_page_range);

struct page *dax_layout_busy_page(struct address_space *mapping)
{
	return dax_layout_busy_page_range(mapping, 0, LLONG_MAX);
}
EXPORT_SYMBOL_GPL(dax_layout_busy_page);
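
/*
 * Typical use (the filesystem callers live outside this file): before
 * truncating or hole-punching a DAX file, a filesystem calls
 * dax_layout_busy_page()/dax_layout_busy_page_range() under locks that block
 * new mappings and, if a busy page is returned, waits for its reference
 * count to drop back to 1 before freeing the backing blocks.
 */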

static int __dax_invalidate_entry(struct address_space *mapping,
					  pgoff_t index, bool trunc)
{
	XA_STATE(xas, &mapping->i_pages, index);
	int ret = 0;
	void *entry;

	xas_lock_irq(&xas);
	entry = get_unlocked_entry(&xas, 0);
	if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
		goto out;
	if (!trunc &&
	    (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) ||
	     xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE)))
		goto out;
	dax_disassociate_entry(entry, mapping, trunc);
	xas_store(&xas, NULL);
	mapping->nrexceptional--;
	ret = 1;
out:
	put_unlocked_entry(&xas, entry, WAKE_ALL);
	xas_unlock_irq(&xas);
	return ret;
}

/*
 * Delete DAX entry at @index from @mapping.  Wait for it
 * to be unlocked before deleting it.
 */
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	int ret = __dax_invalidate_entry(mapping, index, true);

	/*
	 * This gets called from truncate / punch_hole path. As such, the caller
	 * must hold locks protecting against concurrent modifications of the
	 * page cache (usually fs-private i_mmap_sem for writing). Since the
	 * caller has seen a DAX entry for this index, we better find it
	 * at that index as well...
	 */
	WARN_ON_ONCE(!ret);
	return ret;
}

/*
 * Invalidate DAX entry if it is clean.
 */
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
				      pgoff_t index)
{
	return __dax_invalidate_entry(mapping, index, false);
}

static int copy_cow_page_dax(struct block_device *bdev, struct dax_device *dax_dev,
			     sector_t sector, struct page *to, unsigned long vaddr)
{
	void *vto, *kaddr;
	pgoff_t pgoff;
	long rc;
	int id;

	rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff);
	if (rc)
		return rc;

	id = dax_read_lock();
	rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(PAGE_SIZE), &kaddr, NULL);
	if (rc < 0) {
		dax_read_unlock(id);
		return rc;
	}
	vto = kmap_atomic(to);
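	/*
	 * The #ifdef block below appears to be a tree-local fallback: if an
	 * ARM configuration does not provide copy_user_page(), fall back to
	 * a plain copy_page() for the COW copy that follows.
	 */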
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 731) #ifdef CONFIG_ARM
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 732) #ifndef copy_user_page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 733) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 734) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 735) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 736) copy_user_page(vto, (void __force *)kaddr, vaddr, to);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 737) kunmap_atomic(vto);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 738) dax_read_unlock(id);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 739) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 740) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 741)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 742) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 743) * By this point grab_mapping_entry() has ensured that we have a locked entry
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 744) * of the appropriate size so we don't have to worry about downgrading PMDs to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 745) * PTEs. If we happen to be trying to insert a PTE and there is a PMD
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 746) * already in the tree, we will skip the insertion and just dirty the PMD as
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 747) * appropriate.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 748) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 749) static void *dax_insert_entry(struct xa_state *xas,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 750) struct address_space *mapping, struct vm_fault *vmf,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 751) void *entry, pfn_t pfn, unsigned long flags, bool dirty)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 752) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 753) void *new_entry = dax_make_entry(pfn, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 754)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 755) if (dirty)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 756) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 757)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 758) if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 759) unsigned long index = xas->xa_index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 760) /* we are replacing a zero page with block mapping */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 761) if (dax_is_pmd_entry(entry))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 762) unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 763) PG_PMD_NR, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 764) else /* pte entry */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 765) unmap_mapping_pages(mapping, index, 1, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 766) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 767)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 768) xas_reset(xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 769) xas_lock_irq(xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 770) if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 771) void *old;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 772)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 773) dax_disassociate_entry(entry, mapping, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 774) dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 775) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 776) * Only swap our new entry into the page cache if the current
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 777) * entry is a zero page or an empty entry. If a normal PTE or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 778) * PMD entry is already in the cache, we leave it alone. This
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 779) * means that if we are trying to insert a PTE and the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 780) * existing entry is a PMD, we will just leave the PMD in the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 781) * tree and dirty it if necessary.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 782) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 783) old = dax_lock_entry(xas, new_entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 784) WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 785) DAX_LOCKED));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 786) entry = new_entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 787) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 788) xas_load(xas); /* Walk the xa_state */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 789) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 790)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 791) if (dirty)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 792) xas_set_mark(xas, PAGECACHE_TAG_DIRTY);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 793)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 794) xas_unlock_irq(xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 795) return entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 796) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 797)
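/*
 * Compute the user virtual address in @vma that maps file page offset
 * @pgoff; the VM_BUG_ON_VMA() below checks that the result really does
 * lie inside the VMA.
 */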
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 798) static inline
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 799) unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 800) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 801) unsigned long address;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 802)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 803) address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 804) VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 805) return address;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 806) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 807)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 808) /* Walk all mappings of a given index of a file and writeprotect them */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 809) static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 810) unsigned long pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 811) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 812) struct vm_area_struct *vma;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 813) pte_t pte, *ptep = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 814) pmd_t *pmdp = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 815) spinlock_t *ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 816)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 817) i_mmap_lock_read(mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 818) vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 819) struct mmu_notifier_range range;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 820) unsigned long address;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 821)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 822) cond_resched();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 823)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 824) if (!(vma->vm_flags & VM_SHARED))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 825) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 826)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 827) address = pgoff_address(index, vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 828)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 829) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 830) * follow_invalidate_pte() will use the range to call
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 831) * mmu_notifier_invalidate_range_start() on our behalf before
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 832) * taking any lock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 833) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 834) if (follow_invalidate_pte(vma->vm_mm, address, &range, &ptep,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 835) &pmdp, &ptl))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 836) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 837)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 838) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 839) * No need to call mmu_notifier_invalidate_range() as we are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 840) * downgrading page table protection, not changing it to point
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 841) * to a new page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 842) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 843) * See Documentation/vm/mmu_notifier.rst
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 844) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 845) if (pmdp) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 846) #ifdef CONFIG_FS_DAX_PMD
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 847) pmd_t pmd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 848)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 849) if (pfn != pmd_pfn(*pmdp))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 850) goto unlock_pmd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 851) if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 852) goto unlock_pmd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 853)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 854) flush_cache_page(vma, address, pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 855) pmd = pmdp_invalidate(vma, address, pmdp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 856) pmd = pmd_wrprotect(pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 857) pmd = pmd_mkclean(pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 858) set_pmd_at(vma->vm_mm, address, pmdp, pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 859) unlock_pmd:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 860) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 861) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 862) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 863) if (pfn != pte_pfn(*ptep))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 864) goto unlock_pte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 865) if (!pte_dirty(*ptep) && !pte_write(*ptep))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 866) goto unlock_pte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 867)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 868) flush_cache_page(vma, address, pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 869) pte = ptep_clear_flush(vma, address, ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 870) pte = pte_wrprotect(pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 871) pte = pte_mkclean(pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 872) set_pte_at(vma->vm_mm, address, ptep, pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 873) unlock_pte:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 874) pte_unmap_unlock(ptep, ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 875) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 876)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 877) mmu_notifier_invalidate_range_end(&range);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 878) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 879) i_mmap_unlock_read(mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 880) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 881)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 882) static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 883) struct address_space *mapping, void *entry)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 884) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 885) unsigned long pfn, index, count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 886) long ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 887)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 888) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 889) * A page got tagged dirty in DAX mapping? Something is seriously
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 890) * wrong.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 891) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 892) if (WARN_ON(!xa_is_value(entry)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 893) return -EIO;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 894)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 895) if (unlikely(dax_is_locked(entry))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 896) void *old_entry = entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 897)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 898) entry = get_unlocked_entry(xas, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 899)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 900) /* Entry got punched out / reallocated? */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 901) if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 902) goto put_unlocked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 903) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 904) * Entry got reallocated elsewhere? No need to write it back.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 905) * We have to compare pfns as we must not bail out due to a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 906) * difference in the lock bit or entry type.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 907) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 908) if (dax_to_pfn(old_entry) != dax_to_pfn(entry))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 909) goto put_unlocked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 910) if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 911) dax_is_zero_entry(entry))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 912) ret = -EIO;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 913) goto put_unlocked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 914) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 915)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 916) /* Another fsync thread may have already done this entry */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 917) if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 918) goto put_unlocked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 919) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 920)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 921) /* Lock the entry to serialize with page faults */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 922) dax_lock_entry(xas, entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 923)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 924) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 925) * We can clear the tag now, but we have to be careful that concurrent
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 926) * dax_writeback_one() calls for the same index cannot finish before we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 927) * actually flush the caches. This works because those calls look at
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 928) * the entry only under the i_pages lock, and once they do they will
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 929) * see the entry locked and wait for it to be unlocked.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 930) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 931) xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 932) xas_unlock_irq(xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 933)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 934) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 935) * If dax_writeback_mapping_range() was given a wbc->range_start
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 936) * in the middle of a PMD, the 'index' we use needs to be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 937) * aligned to the start of the PMD.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 938) * This allows us to flush for PMD_SIZE and not have to worry about
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 939) * partial PMD writebacks.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 940) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 941) pfn = dax_to_pfn(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 942) count = 1UL << dax_entry_order(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 943) index = xas->xa_index & ~(count - 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 944)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 945) dax_entry_mkclean(mapping, index, pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 946) dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 947) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 948) * After we have flushed the cache, we can clear the dirty tag. There
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 949) * cannot be new dirty data in the pfn after the flush has completed, as
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 950) * the pfn mappings are write-protected and the fault path waits for the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 951) * mapping entry lock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 952) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 953) xas_reset(xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 954) xas_lock_irq(xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 955) xas_store(xas, entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 956) xas_clear_mark(xas, PAGECACHE_TAG_DIRTY);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 957) dax_wake_entry(xas, entry, WAKE_NEXT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 958)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 959) trace_dax_writeback_one(mapping->host, index, count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 960) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 961)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 962) put_unlocked:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 963) put_unlocked_entry(xas, entry, WAKE_NEXT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 964) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 965) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 966)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 967) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 968) * Flush the mapping to the persistent domain within the byte range of [start,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 969) * end]. This is required by data integrity operations to ensure file data is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 970) * on persistent storage prior to completion of the operation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 971) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 972) int dax_writeback_mapping_range(struct address_space *mapping,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 973) struct dax_device *dax_dev, struct writeback_control *wbc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 974) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 975) XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 976) struct inode *inode = mapping->host;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 977) pgoff_t end_index = wbc->range_end >> PAGE_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 978) void *entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 979) int ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 980) unsigned int scanned = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 981)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 982) if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 983) return -EIO;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 984)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 985) if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 986) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 987)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 988) trace_dax_writeback_range(inode, xas.xa_index, end_index);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 989)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 990) tag_pages_for_writeback(mapping, xas.xa_index, end_index);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 991)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 992) xas_lock_irq(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 993) xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 994) ret = dax_writeback_one(&xas, dax_dev, mapping, entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 995) if (ret < 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 996) mapping_set_error(mapping, ret);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 997) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 998) }
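/*
 * Every XA_CHECK_SCHED entries, briefly drop the xarray lock and
 * reschedule so we do not hog the CPU with interrupts disabled.
 */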
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 999) if (++scanned % XA_CHECK_SCHED)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1000) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1001)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1002) xas_pause(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1003) xas_unlock_irq(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1004) cond_resched();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1005) xas_lock_irq(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1006) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1007) xas_unlock_irq(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1008) trace_dax_writeback_range_done(inode, xas.xa_index, end_index);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1009) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1010) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1011) EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1012)
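/*
 * Convert a byte position in the file to a 512-byte sector number on
 * the block device backing this iomap extent.
 */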
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1013) static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1014) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1015) return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1016) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1017)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1018) static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1019) pfn_t *pfnp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1020) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1021) const sector_t sector = dax_iomap_sector(iomap, pos);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1022) pgoff_t pgoff;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1023) int id, rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1024) long length;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1025)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1026) rc = bdev_dax_pgoff(iomap->bdev, sector, size, &pgoff);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1027) if (rc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1028) return rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1029) id = dax_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1030) length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1031) NULL, pfnp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1032) if (length < 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1033) rc = length;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1034) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1035) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1036) rc = -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1037) if (PFN_PHYS(length) < size)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1038) goto out;
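/* The returned pfn must be naturally aligned to the mapping size. */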
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1039) if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1040) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1041) /* For larger pages we need devmap */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1042) if (length > 1 && !pfn_t_devmap(*pfnp))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1043) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1044) rc = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1045) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1046) dax_read_unlock(id);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1047) return rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1048) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1049)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1050) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1051) * The user has performed a load from a hole in the file. Allocating a new
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1052) * page in the file would cause excessive storage usage for workloads with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1053) * sparse files. Instead we insert a read-only mapping of the 4k zero page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1054) * If this page is ever written to we will re-fault and change the mapping to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1055) * point to real DAX storage instead.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1056) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1057) static vm_fault_t dax_load_hole(struct xa_state *xas,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1058) struct address_space *mapping, void **entry,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1059) struct vm_fault *vmf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1060) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1061) struct inode *inode = mapping->host;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1062) unsigned long vaddr = vmf->address;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1063) pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1064) vm_fault_t ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1065)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1066) *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1067) DAX_ZERO_PAGE, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1068)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1069) ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1070) trace_dax_load_hole(inode, vmf, ret);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1071) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1072) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1073)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1074) s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1075) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1076) sector_t sector = iomap_sector(iomap, pos & PAGE_MASK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1077) pgoff_t pgoff;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1078) long rc, id;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1079) void *kaddr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1080) bool page_aligned = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1081) unsigned offset = offset_in_page(pos);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1082) unsigned size = min_t(u64, PAGE_SIZE - offset, length);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1083)
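/*
 * Use the dax device's zero_page_range() op only when zeroing a whole,
 * page-aligned page; otherwise map the page and zero the sub-page range
 * by hand below.
 */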
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1084) if (IS_ALIGNED(sector << SECTOR_SHIFT, PAGE_SIZE) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1085) (size == PAGE_SIZE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1086) page_aligned = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1087)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1088) rc = bdev_dax_pgoff(iomap->bdev, sector, PAGE_SIZE, &pgoff);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1089) if (rc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1090) return rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1091)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1092) id = dax_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1093)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1094) if (page_aligned)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1095) rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1096) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1097) rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1098) if (rc < 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1099) dax_read_unlock(id);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1100) return rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1101) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1102)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1103) if (!page_aligned) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1104) memset(kaddr + offset, 0, size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1105) dax_flush(iomap->dax_dev, kaddr + offset, size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1106) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1107) dax_read_unlock(id);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1108) return size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1109) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1110)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1111) static loff_t
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1112) dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1113) struct iomap *iomap, struct iomap *srcmap)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1114) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1115) struct block_device *bdev = iomap->bdev;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1116) struct dax_device *dax_dev = iomap->dax_dev;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1117) struct iov_iter *iter = data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1118) loff_t end = pos + length, done = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1119) ssize_t ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1120) size_t xfer;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1121) int id;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1122)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1123) if (iov_iter_rw(iter) == READ) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1124) end = min(end, i_size_read(inode));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1125) if (pos >= end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1126) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1127)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1128) if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1129) return iov_iter_zero(min(length, end - pos), iter);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1130) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1131)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1132) if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1133) return -EIO;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1134)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1135) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1136) * A write can allocate a block for an area which has a hole page mapped
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1137) * into the page tables. We have to tear down these mappings so that data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1138) * written by write(2) is visible via mmap.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1139) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1140) if (iomap->flags & IOMAP_F_NEW) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1141) invalidate_inode_pages2_range(inode->i_mapping,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1142) pos >> PAGE_SHIFT,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1143) (end - 1) >> PAGE_SHIFT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1144) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1145)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1146) id = dax_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1147) while (pos < end) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1148) unsigned offset = pos & (PAGE_SIZE - 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1149) const size_t size = ALIGN(length + offset, PAGE_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1150) const sector_t sector = dax_iomap_sector(iomap, pos);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1151) ssize_t map_len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1152) pgoff_t pgoff;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1153) void *kaddr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1154)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1155) if (fatal_signal_pending(current)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1156) ret = -EINTR;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1157) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1158) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1159)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1160) ret = bdev_dax_pgoff(bdev, sector, size, &pgoff);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1161) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1162) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1163)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1164) map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1165) &kaddr, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1166) if (map_len < 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1167) ret = map_len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1168) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1169) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1170)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1171) map_len = PFN_PHYS(map_len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1172) kaddr += offset;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1173) map_len -= offset;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1174) if (map_len > end - pos)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1175) map_len = end - pos;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1176)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1177) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1178) * The userspace address for the memory copy has already been
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1179) * validated via access_ok() in either vfs_read() or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1180) * vfs_write(), depending on which operation we are doing.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1181) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1182) if (iov_iter_rw(iter) == WRITE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1183) xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1184) map_len, iter);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1185) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1186) xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1187) map_len, iter);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1188)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1189) pos += xfer;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1190) length -= xfer;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1191) done += xfer;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1192)
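/*
 * A short copy means the dax copy routine could not transfer the whole
 * chunk (e.g. it faulted on the user buffer): flag -EFAULT if nothing
 * was copied at all and stop after any partial transfer.
 */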
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1193) if (xfer == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1194) ret = -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1195) if (xfer < map_len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1196) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1197) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1198) dax_read_unlock(id);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1199)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1200) return done ? done : ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1201) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1202)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1203) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1204) * dax_iomap_rw - Perform I/O to a DAX file
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1205) * @iocb: The control block for this I/O
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1206) * @iter: The addresses to do I/O from or to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1207) * @ops: iomap ops passed from the file system
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1208) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1209) * This function performs read and write operations to directly mapped
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1210) * persistent memory. The caller needs to take care of read/write exclusion
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1211) * and evicting any page cache pages in the region under I/O.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1212) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1213) ssize_t
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1214) dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1215) const struct iomap_ops *ops)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1216) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1217) struct address_space *mapping = iocb->ki_filp->f_mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1218) struct inode *inode = mapping->host;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1219) loff_t pos = iocb->ki_pos, ret = 0, done = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1220) unsigned flags = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1221)
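/* Writes must hold i_rwsem exclusively; reads may hold it shared. */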
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1222) if (iov_iter_rw(iter) == WRITE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1223) lockdep_assert_held_write(&inode->i_rwsem);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1224) flags |= IOMAP_WRITE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1225) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1226) lockdep_assert_held(&inode->i_rwsem);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1227) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1228)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1229) if (iocb->ki_flags & IOCB_NOWAIT)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1230) flags |= IOMAP_NOWAIT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1231)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1232) while (iov_iter_count(iter)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1233) ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1234) iter, dax_iomap_actor);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1235) if (ret <= 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1236) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1237) pos += ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1238) done += ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1239) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1240)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1241) iocb->ki_pos += done;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1242) return done ? done : ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1243) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1244) EXPORT_SYMBOL_GPL(dax_iomap_rw);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1245)
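/*
 * Translate the error from the fault path into a vm_fault_t: success
 * becomes VM_FAULT_NOPAGE, anything else goes through vmf_error().
 */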
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1246) static vm_fault_t dax_fault_return(int error)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1247) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1248) if (error == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1249) return VM_FAULT_NOPAGE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1250) return vmf_error(error);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1251) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1252)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1253) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1254) * MAP_SYNC on a dax mapping guarantees dirty metadata is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1255) * flushed on write-faults (non-cow), but not read-faults.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1256) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1257) static bool dax_fault_is_synchronous(unsigned long flags,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1258) struct vm_area_struct *vma, struct iomap *iomap)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1259) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1260) return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1261) && (iomap->flags & IOMAP_F_DIRTY);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1262) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1263)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1264) static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1265) int *iomap_errp, const struct iomap_ops *ops)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1266) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1267) struct vm_area_struct *vma = vmf->vma;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1268) struct address_space *mapping = vma->vm_file->f_mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1269) XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1270) struct inode *inode = mapping->host;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1271) unsigned long vaddr = vmf->address;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1272) loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1273) struct iomap iomap = { .type = IOMAP_HOLE };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1274) struct iomap srcmap = { .type = IOMAP_HOLE };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1275) unsigned flags = IOMAP_FAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1276) int error, major = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1277) bool write = vmf->flags & FAULT_FLAG_WRITE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1278) bool sync;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1279) vm_fault_t ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1280) void *entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1281) pfn_t pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1282)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1283) trace_dax_pte_fault(inode, vmf, ret);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1284) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1285) * Check whether offset isn't beyond end of file now. Caller is supposed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1286) * to hold locks serializing us with truncate / punch hole so this is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1287) * a reliable test.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1288) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1289) if (pos >= i_size_read(inode)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1290) ret = VM_FAULT_SIGBUS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1291) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1292) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1293)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1294) if (write && !vmf->cow_page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1295) flags |= IOMAP_WRITE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1296)
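/*
 * grab_mapping_entry() returns a locked entry for this index, or an
 * internal xarray value encoding the vm_fault_t to return.
 */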
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1297) entry = grab_mapping_entry(&xas, mapping, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1298) if (xa_is_internal(entry)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1299) ret = xa_to_internal(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1300) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1301) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1302)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1303) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1304) * It is possible, particularly with mixed reads & writes to private
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1305) * mappings, that we have raced with a PMD fault that overlaps with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1306) * the PTE we need to set up. If so just return and the fault will be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1307) * retried.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1308) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1309) if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1310) ret = VM_FAULT_NOPAGE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1311) goto unlock_entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1312) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1313)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1314) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1315) * Note that we don't bother to use iomap_apply here: DAX requires
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1316) * the file system block size to be equal to the page size, which means
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1317) * that we never have to deal with more than a single extent here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1318) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1319) error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap, &srcmap);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1320) if (iomap_errp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1321) *iomap_errp = error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1322) if (error) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1323) ret = dax_fault_return(error);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1324) goto unlock_entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1325) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1326) if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1327) error = -EIO; /* fs corruption? */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1328) goto error_finish_iomap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1329) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1330)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1331) if (vmf->cow_page) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1332) sector_t sector = dax_iomap_sector(&iomap, pos);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1333)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1334) switch (iomap.type) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1335) case IOMAP_HOLE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1336) case IOMAP_UNWRITTEN:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1337) clear_user_highpage(vmf->cow_page, vaddr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1338) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1339) case IOMAP_MAPPED:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1340) error = copy_cow_page_dax(iomap.bdev, iomap.dax_dev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1341) sector, vmf->cow_page, vaddr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1342) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1343) default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1344) WARN_ON_ONCE(1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1345) error = -EIO;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1346) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1347) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1348)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1349) if (error)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1350) goto error_finish_iomap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1351)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1352) __SetPageUptodate(vmf->cow_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1353) ret = finish_fault(vmf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1354) if (!ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1355) ret = VM_FAULT_DONE_COW;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1356) goto finish_iomap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1357) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1358)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1359) sync = dax_fault_is_synchronous(flags, vma, &iomap);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1360)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1361) switch (iomap.type) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1362) case IOMAP_MAPPED:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1363) if (iomap.flags & IOMAP_F_NEW) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1364) count_vm_event(PGMAJFAULT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1365) count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1366) major = VM_FAULT_MAJOR;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1367) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1368) error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1369) if (error < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1370) goto error_finish_iomap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1371)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1372) entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1373) 0, write && !sync);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1374)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1375) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1376) * If we are doing a synchronous page fault and the inode needs fsync,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1377) * we can insert the PTE into the page tables only after that happens.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1378) * Skip the insertion for now and return the pfn so that the caller can
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1379) * insert it after fsync is done.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1380) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1381) if (sync) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1382) if (WARN_ON_ONCE(!pfnp)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1383) error = -EIO;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1384) goto error_finish_iomap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1385) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1386) *pfnp = pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1387) ret = VM_FAULT_NEEDDSYNC | major;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1388) goto finish_iomap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1389) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1390) trace_dax_insert_mapping(inode, vmf, entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1391) if (write)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1392) ret = vmf_insert_mixed_mkwrite(vma, vaddr, pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1393) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1394) ret = vmf_insert_mixed(vma, vaddr, pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1395)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1396) goto finish_iomap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1397) case IOMAP_UNWRITTEN:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1398) case IOMAP_HOLE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1399) if (!write) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1400) ret = dax_load_hole(&xas, mapping, &entry, vmf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1401) goto finish_iomap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1402) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1403) fallthrough;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1404) default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1405) WARN_ON_ONCE(1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1406) error = -EIO;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1407) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1408) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1409)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1410) error_finish_iomap:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1411) ret = dax_fault_return(error);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1412) finish_iomap:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1413) if (ops->iomap_end) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1414) int copied = PAGE_SIZE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1415)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1416) if (ret & VM_FAULT_ERROR)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1417) copied = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1418) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1419) * The fault is done by now and there's no way back (another
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1420) * thread may already be happily using the PTE we have installed).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1421) * Just ignore error from ->iomap_end since we cannot do much
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1422) * with it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1423) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1424) ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1425) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1426) unlock_entry:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1427) dax_unlock_entry(&xas, entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1428) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1429) trace_dax_pte_fault_done(inode, vmf, ret);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1430) return ret | major;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1431) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1432)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1433) #ifdef CONFIG_FS_DAX_PMD
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1434) static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1435) struct iomap *iomap, void **entry)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1436) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1437) struct address_space *mapping = vmf->vma->vm_file->f_mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1438) unsigned long pmd_addr = vmf->address & PMD_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1439) struct vm_area_struct *vma = vmf->vma;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1440) struct inode *inode = mapping->host;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1441) pgtable_t pgtable = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1442) struct page *zero_page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1443) spinlock_t *ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1444) pmd_t pmd_entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1445) pfn_t pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1446)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1447) zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1448)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1449) if (unlikely(!zero_page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1450) goto fallback;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1451)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1452) pfn = page_to_pfn_t(zero_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1453) *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1454) DAX_PMD | DAX_ZERO_PAGE, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1455)
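/*
 * Some architectures require a page table to be deposited when
 * installing a huge PMD; allocate one up front if so.
 */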
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1456) if (arch_needs_pgtable_deposit()) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1457) pgtable = pte_alloc_one(vma->vm_mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1458) if (!pgtable)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1459) return VM_FAULT_OOM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1460) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1461)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1462) ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1463) if (!pmd_none(*(vmf->pmd))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1464) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1465) goto fallback;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1466) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1467)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1468) if (pgtable) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1469) pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1470) mm_inc_nr_ptes(vma->vm_mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1471) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1472) pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1473) pmd_entry = pmd_mkhuge(pmd_entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1474) set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1475) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1476) trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1477) return VM_FAULT_NOPAGE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1478)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1479) fallback:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1480) if (pgtable)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1481) pte_free(vma->vm_mm, pgtable);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1482) trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1483) return VM_FAULT_FALLBACK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1484) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1485)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1486) static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1487) const struct iomap_ops *ops)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1488) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1489) struct vm_area_struct *vma = vmf->vma;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1490) struct address_space *mapping = vma->vm_file->f_mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1491) XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1492) unsigned long pmd_addr = vmf->address & PMD_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1493) bool write = vmf->flags & FAULT_FLAG_WRITE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1494) bool sync;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1495) unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1496) struct inode *inode = mapping->host;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1497) vm_fault_t result = VM_FAULT_FALLBACK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1498) struct iomap iomap = { .type = IOMAP_HOLE };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1499) struct iomap srcmap = { .type = IOMAP_HOLE };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1500) pgoff_t max_pgoff;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1501) void *entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1502) loff_t pos;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1503) int error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1504) pfn_t pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1505)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1506) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1507) * Check whether offset isn't beyond end of file now. Caller is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1508) * supposed to hold locks serializing us with truncate / punch hole so
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1509) * this is a reliable test.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1510) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1511) max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1512)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1513) trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1514)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1515) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1516) * Make sure that the faulting address's PMD offset (color) matches
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1517) * the PMD offset from the start of the file. This is necessary so
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1518) * that a PMD range in the page table overlaps exactly with a PMD
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1519) * range in the page cache.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1520) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1521) if ((vmf->pgoff & PG_PMD_COLOUR) !=
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1522) ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1523) goto fallback;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1524)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1525) /* Fall back to PTEs if we're going to COW */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1526) if (write && !(vma->vm_flags & VM_SHARED))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1527) goto fallback;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1528)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1529) /* If the PMD would extend outside the VMA */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1530) if (pmd_addr < vma->vm_start)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1531) goto fallback;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1532) if ((pmd_addr + PMD_SIZE) > vma->vm_end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1533) goto fallback;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1534)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1535) if (xas.xa_index >= max_pgoff) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1536) result = VM_FAULT_SIGBUS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1537) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1538) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1539)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1540) /* If the PMD would extend beyond the file size */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1541) if ((xas.xa_index | PG_PMD_COLOUR) >= max_pgoff)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1542) goto fallback;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1543)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1544) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1545) * grab_mapping_entry() will make sure we get an empty PMD entry,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1546) * a zero PMD entry or a DAX PMD. If it can't (because a PTE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1547) * entry is already in the array, for instance), it will return
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1548) * VM_FAULT_FALLBACK.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1549) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1550) entry = grab_mapping_entry(&xas, mapping, PMD_ORDER);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1551) if (xa_is_internal(entry)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1552) result = xa_to_internal(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1553) goto fallback;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1554) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1555)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1556) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1557) * It is possible, particularly with mixed reads & writes to private
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1558) * mappings, that we have raced with a PTE fault that overlaps with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1559) * the PMD we need to set up. If so just return and the fault will be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1560) * retried.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1561) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1562) if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1563) !pmd_devmap(*vmf->pmd)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1564) result = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1565) goto unlock_entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1566) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1567)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1568) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1569) * Note that we don't use iomap_apply here. We aren't doing I/O, only
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1570) * setting up a mapping, so really we're using iomap_begin() as a way
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1571) * to look up our filesystem block.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1572) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1573) pos = (loff_t)xas.xa_index << PAGE_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1574) error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1575) &srcmap);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1576) if (error)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1577) goto unlock_entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1578)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1579) if (iomap.offset + iomap.length < pos + PMD_SIZE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1580) goto finish_iomap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1581)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1582) sync = dax_fault_is_synchronous(iomap_flags, vma, &iomap);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1583)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1584) switch (iomap.type) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1585) case IOMAP_MAPPED:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1586) error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1587) if (error < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1588) goto finish_iomap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1589)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1590) entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1591) DAX_PMD, write && !sync);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1592)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1593) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1594) * If we are doing a synchronous page fault and the inode needs fsync,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1595) * we can insert the PMD into the page tables only after that fsync has
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1596) * happened.  Skip the insertion for now and return the pfn so that the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1597) * caller can insert it after the fsync is done.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1598) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1599) if (sync) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1600) if (WARN_ON_ONCE(!pfnp))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1601) goto finish_iomap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1602) *pfnp = pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1603) result = VM_FAULT_NEEDDSYNC;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1604) goto finish_iomap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1605) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1606)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1607) trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1608) result = vmf_insert_pfn_pmd(vmf, pfn, write);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1609) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1610) case IOMAP_UNWRITTEN:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1611) case IOMAP_HOLE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1612) if (WARN_ON_ONCE(write))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1613) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1614) result = dax_pmd_load_hole(&xas, vmf, &iomap, &entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1615) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1616) default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1617) WARN_ON_ONCE(1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1618) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1619) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1620)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1621) finish_iomap:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1622) if (ops->iomap_end) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1623) int copied = PMD_SIZE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1624)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1625) if (result == VM_FAULT_FALLBACK)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1626) copied = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1627) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1628) * The fault is done by now and there's no way back (another thread
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1629) * may already be happily using the PMD we have installed).  Just
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1630) * ignore any error from ->iomap_end since we cannot do much
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1631) * with it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1632) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1633) ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1634) &iomap);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1635) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1636) unlock_entry:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1637) dax_unlock_entry(&xas, entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1638) fallback:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1639) if (result == VM_FAULT_FALLBACK) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1640) split_huge_pmd(vma, vmf->pmd, vmf->address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1641) count_vm_event(THP_FAULT_FALLBACK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1642) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1643) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1644) trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1645) return result;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1646) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1647) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1648) static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1649) const struct iomap_ops *ops)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1650) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1651) return VM_FAULT_FALLBACK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1652) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1653) #endif /* CONFIG_FS_DAX_PMD */
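
/*
 * Illustrative sketch only (not part of this file; all "example_*" names are
 * hypothetical): the fault paths above use ->iomap_begin() purely as a block
 * lookup and never issue I/O through it.  A minimal DAX-capable
 * implementation might look roughly like this, assuming a helper that
 * resolves a file offset to a contiguous extent on the DAX device:
 *
 *	static int example_iomap_begin(struct inode *inode, loff_t pos,
 *			loff_t length, unsigned int flags, struct iomap *iomap,
 *			struct iomap *srcmap)
 *	{
 *		u64 disk_addr;	// byte address of the extent backing 'pos'
 *		u64 extent_len;	// length of that contiguous extent
 *
 *		if (example_lookup_extent(inode, pos, &disk_addr, &extent_len))
 *			return -EIO;
 *
 *		iomap->addr = disk_addr;
 *		iomap->offset = pos;
 *		iomap->length = extent_len;
 *		iomap->type = IOMAP_MAPPED;
 *		iomap->bdev = inode->i_sb->s_bdev;
 *		iomap->dax_dev = example_get_dax_dev(inode->i_sb);
 *		return 0;
 *	}
 *
 * Returning an extent shorter than PMD_SIZE is fine: dax_iomap_pmd_fault()
 * above simply falls back to PTEs when the mapping does not cover the whole
 * huge page.  ->iomap_end() is optional; when present, the PMD path calls it
 * with 'copied' set to 0 on fallback and ignores its return value.
 */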
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1654)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1655) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1656) * dax_iomap_fault - handle a page fault on a DAX file
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1657) * @vmf: The description of the fault
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1658) * @pe_size: Size of the page to fault in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1659) * @pfnp: Where to return the PFN for a synchronous fault that still needs fsync
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1660) * @iomap_errp: Storage for detailed error code in case of error
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1661) * @ops: Iomap ops passed from the file system
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1662) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1663) * When a page fault occurs, filesystems may call this helper in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1664) * their fault handler for DAX files. dax_iomap_fault() assumes the caller
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1665) * has done all the necessary locking for the page fault to proceed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1666) * successfully (an illustrative caller is sketched below the function).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1667) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1668) vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1669) pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1670) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1671) switch (pe_size) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1672) case PE_SIZE_PTE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1673) return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1674) case PE_SIZE_PMD:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1675) return dax_iomap_pmd_fault(vmf, pfnp, ops);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1676) default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1677) return VM_FAULT_FALLBACK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1678) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1679) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1680) EXPORT_SYMBOL_GPL(dax_iomap_fault);
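
/*
 * Illustrative sketch only (hypothetical "example_*" names): a filesystem
 * typically wires dax_iomap_fault() into its vm_operations_struct roughly
 * like this (write/freeze protection and synchronous faults are omitted;
 * see the sketch after dax_finish_sync_fault() for the MAP_SYNC case, and
 * ext4/xfs for real callers):
 *
 *	static vm_fault_t example_dax_fault(struct vm_fault *vmf)
 *	{
 *		return dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, NULL,
 *				&example_iomap_ops);
 *	}
 *
 *	static const struct vm_operations_struct example_dax_vm_ops = {
 *		.fault		= example_dax_fault,
 *		.page_mkwrite	= example_dax_fault,
 *	};
 */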
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1681)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1682) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1683) * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1684) * @vmf: The description of the fault
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1685) * @pfn: PFN to insert
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1686) * @order: Order of entry to insert.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1687) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1688) * This function inserts a writeable PTE or PMD entry into the page tables
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1689) * for an mmapped DAX file.  It also marks the page cache entry as dirty.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1690) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1691) static vm_fault_t
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1692) dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1693) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1694) struct address_space *mapping = vmf->vma->vm_file->f_mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1695) XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1696) void *entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1697) vm_fault_t ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1698)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1699) xas_lock_irq(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1700) entry = get_unlocked_entry(&xas, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1701) /* Did we race with someone splitting the entry or similar? */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1702) if (!entry || dax_is_conflict(entry) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1703) (order == 0 && !dax_is_pte_entry(entry))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1704) put_unlocked_entry(&xas, entry, WAKE_NEXT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1705) xas_unlock_irq(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1706) trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1707) VM_FAULT_NOPAGE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1708) return VM_FAULT_NOPAGE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1709) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1710) xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1711) dax_lock_entry(&xas, entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1712) xas_unlock_irq(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1713) if (order == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1714) ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1715) #ifdef CONFIG_FS_DAX_PMD
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1716) else if (order == PMD_ORDER)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1717) ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1718) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1719) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1720) ret = VM_FAULT_FALLBACK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1721) dax_unlock_entry(&xas, entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1722) trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1723) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1724) }
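
/*
 * For orientation (assuming the common x86-64 configuration of 4KiB pages and
 * 2MiB PMDs): pe_order(PE_SIZE_PMD) == PMD_SHIFT - PAGE_SHIFT == 21 - 12 == 9,
 * so the PMD case above covers 1 << 9 == 512 pages, and the fsync range in
 * dax_finish_sync_fault() below spans PAGE_SIZE << 9 == 2MiB starting at the
 * faulting page offset.
 */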
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1725)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1726) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1727) * dax_finish_sync_fault - finish synchronous page fault
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1728) * @vmf: The description of the fault
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1729) * @pe_size: Size of entry to be inserted
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1730) * @pfn: PFN to insert
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1731) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1732) * This function ensures that the file range touched by the page fault is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1733) * stored persistently on the media and then inserts the appropriate page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1734) * table entry (an illustrative caller is sketched below the function).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1735) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1736) vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1737) enum page_entry_size pe_size, pfn_t pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1738) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1739) int err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1740) loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1741) unsigned int order = pe_order(pe_size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1742) size_t len = PAGE_SIZE << order;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1743)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1744) err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1745) if (err)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1746) return VM_FAULT_SIGBUS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1747) return dax_insert_pfn_mkwrite(vmf, pfn, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1748) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1749) EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
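
/*
 * Illustrative sketch only (hypothetical "example_*" names): a filesystem
 * that supports MAP_SYNC combines dax_iomap_fault() and
 * dax_finish_sync_fault() roughly like this in its fault handler:
 *
 *	static vm_fault_t example_dax_huge_fault(struct vm_fault *vmf,
 *			enum page_entry_size pe_size)
 *	{
 *		bool write = vmf->flags & FAULT_FLAG_WRITE;
 *		struct inode *inode = file_inode(vmf->vma->vm_file);
 *		vm_fault_t ret;
 *		pfn_t pfn;
 *
 *		if (write) {
 *			sb_start_pagefault(inode->i_sb);
 *			file_update_time(vmf->vma->vm_file);
 *		}
 *
 *		// For a synchronous fault that still needs fsync,
 *		// dax_iomap_fault() fills 'pfn' and returns
 *		// VM_FAULT_NEEDDSYNC instead of mapping the page.
 *		ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL,
 *				&example_iomap_ops);
 *		if (ret & VM_FAULT_NEEDDSYNC)
 *			ret = dax_finish_sync_fault(vmf, pe_size, pfn);
 *
 *		if (write)
 *			sb_end_pagefault(inode->i_sb);
 *		return ret;
 *	}
 *
 * This mirrors the ext4 and xfs callers; the DAX-specific parts are the two
 * calls above and passing &pfn so that the synchronous case can be finished
 * after the fsync.
 */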