// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/dax.c - Direct Access filesystem code
 * Copyright (c) 2013-2014 Intel Corporation
 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
 */

#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/pagevec.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/uio.h>
#include <linux/vmstat.h>
#include <linux/pfn_t.h>
#include <linux/sizes.h>
#include <linux/mmu_notifier.h>
#include <linux/iomap.h>
#include <asm/pgalloc.h>

#define CREATE_TRACE_POINTS
#include <trace/events/fs_dax.h>

static inline unsigned int pe_order(enum page_entry_size pe_size)
{
	if (pe_size == PE_SIZE_PTE)
		return PAGE_SHIFT - PAGE_SHIFT;
	if (pe_size == PE_SIZE_PMD)
		return PMD_SHIFT - PAGE_SHIFT;
	if (pe_size == PE_SIZE_PUD)
		return PUD_SHIFT - PAGE_SHIFT;
	return ~0;
}

/* We choose 4096 entries - same as per-zone page wait tables */
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)

/* The 'colour' (i.e. low bits) within a PMD of a page offset. */
#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
#define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT)

/* The order of a PMD entry */
#define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT)

static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];

static int __init init_dax_wait_table(void)
{
	int i;

	for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
		init_waitqueue_head(wait_table + i);
	return 0;
}
fs_initcall(init_dax_wait_table);

/*
 * DAX pagecache entries use XArray value entries so they can't be mistaken
 * for pages. We use one bit for locking, one bit for the entry size (PMD)
 * and two more to tell us if the entry is a zero page or an empty entry that
 * is just used for locking. In total four special bits.
 *
 * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
 * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
 * block allocation.
 */
#define DAX_SHIFT	(4)
#define DAX_LOCKED	(1UL << 0)
#define DAX_PMD		(1UL << 1)
#define DAX_ZERO_PAGE	(1UL << 2)
#define DAX_EMPTY	(1UL << 3)

static unsigned long dax_to_pfn(void *entry)
{
	return xa_to_value(entry) >> DAX_SHIFT;
}

static void *dax_make_entry(pfn_t pfn, unsigned long flags)
{
	return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
}
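
/*
 * Worked example (hypothetical pfn value): dax_make_entry(pfn_to_pfn_t(0x1234),
 * DAX_PMD) encodes the XArray value (0x1234 << DAX_SHIFT) | DAX_PMD, and
 * dax_to_pfn() recovers 0x1234 by shifting the flag bits back out.  On x86-64
 * with 4K pages, PMD_ORDER above evaluates to 9 (512 PTEs per PMD).
 */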

static bool dax_is_locked(void *entry)
{
	return xa_to_value(entry) & DAX_LOCKED;
}

static unsigned int dax_entry_order(void *entry)
{
	if (xa_to_value(entry) & DAX_PMD)
		return PMD_ORDER;
	return 0;
}

static unsigned long dax_is_pmd_entry(void *entry)
{
	return xa_to_value(entry) & DAX_PMD;
}

static bool dax_is_pte_entry(void *entry)
{
	return !(xa_to_value(entry) & DAX_PMD);
}

static int dax_is_zero_entry(void *entry)
{
	return xa_to_value(entry) & DAX_ZERO_PAGE;
}

static int dax_is_empty_entry(void *entry)
{
	return xa_to_value(entry) & DAX_EMPTY;
}

/*
 * true if the entry that was found is of a smaller order than the entry
 * we were looking for
 */
static bool dax_is_conflict(void *entry)
{
	return entry == XA_RETRY_ENTRY;
}

/*
 * DAX page cache entry locking
 */
struct exceptional_entry_key {
	struct xarray *xa;
	pgoff_t entry_start;
};

struct wait_exceptional_entry_queue {
	wait_queue_entry_t wait;
	struct exceptional_entry_key key;
};

/**
 * enum dax_wake_mode: waitqueue wakeup behaviour
 * @WAKE_ALL: wake all waiters in the waitqueue
 * @WAKE_NEXT: wake only the first waiter in the waitqueue
 */
enum dax_wake_mode {
	WAKE_ALL,
	WAKE_NEXT,
};

static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas,
		void *entry, struct exceptional_entry_key *key)
{
	unsigned long hash;
	unsigned long index = xas->xa_index;

	/*
	 * If 'entry' is a PMD, align the 'index' that we use for the wait
	 * queue to the start of that PMD.  This ensures that all offsets in
	 * the range covered by the PMD map to the same bit lock.
	 */
	if (dax_is_pmd_entry(entry))
		index &= ~PG_PMD_COLOUR;
	key->xa = xas->xa;
	key->entry_start = index;

	hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS);
	return wait_table + hash;
}
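
/*
 * Illustration: on a configuration with 512 pages per PMD, a waiter on any
 * page offset in 0x200-0x3ff of a PMD entry has its index masked down to
 * 0x200 above, so all of those waiters hash to the same wait_table bucket
 * and match the same wakeup key.
 */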

static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
		unsigned int mode, int sync, void *keyp)
{
	struct exceptional_entry_key *key = keyp;
	struct wait_exceptional_entry_queue *ewait =
		container_of(wait, struct wait_exceptional_entry_queue, wait);

	if (key->xa != ewait->key.xa ||
	    key->entry_start != ewait->key.entry_start)
		return 0;
	return autoremove_wake_function(wait, mode, sync, NULL);
}

/*
 * @entry may no longer be the entry at the index in the mapping.
 * The important information it's conveying is whether the entry at
 * this index used to be a PMD entry.
 */
static void dax_wake_entry(struct xa_state *xas, void *entry,
			   enum dax_wake_mode mode)
{
	struct exceptional_entry_key key;
	wait_queue_head_t *wq;

	wq = dax_entry_waitqueue(xas, entry, &key);

	/*
	 * Checking for locked entry and prepare_to_wait_exclusive() happens
	 * under the i_pages lock, ditto for entry handling in our callers.
	 * So at this point all tasks that could have seen our entry locked
	 * must be in the waitqueue and the following check will see them.
	 */
	if (waitqueue_active(wq))
		__wake_up(wq, TASK_NORMAL, mode == WAKE_ALL ? 0 : 1, &key);
}

/*
 * Look up entry in page cache, wait for it to become unlocked if it
 * is a DAX entry and return it.  The caller must subsequently call
 * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry()
 * if it did.  The entry returned may have a larger order than @order.
 * If @order is larger than the order of the entry found in i_pages, this
 * function returns a dax_is_conflict entry.
 *
 * Must be called with the i_pages lock held.
 */
static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
{
	void *entry;
	struct wait_exceptional_entry_queue ewait;
	wait_queue_head_t *wq;

	init_wait(&ewait.wait);
	ewait.wait.func = wake_exceptional_entry_func;

	for (;;) {
		entry = xas_find_conflict(xas);
		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
			return entry;
		if (dax_entry_order(entry) < order)
			return XA_RETRY_ENTRY;
		if (!dax_is_locked(entry))
			return entry;

		wq = dax_entry_waitqueue(xas, entry, &ewait.key);
		prepare_to_wait_exclusive(wq, &ewait.wait,
					  TASK_UNINTERRUPTIBLE);
		xas_unlock_irq(xas);
		xas_reset(xas);
		schedule();
		finish_wait(wq, &ewait.wait);
		xas_lock_irq(xas);
	}
}
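
/*
 * Sketch of the calling convention described above (see also
 * __dax_invalidate_entry() below):
 *
 *	xas_lock_irq(&xas);
 *	entry = get_unlocked_entry(&xas, 0);
 *	... inspect the (unlocked) entry ...
 *	put_unlocked_entry(&xas, entry, WAKE_NEXT);	(did not lock it)
 *	xas_unlock_irq(&xas);
 *
 * A caller that instead locks the entry with dax_lock_entry() must release
 * it with dax_unlock_entry() rather than put_unlocked_entry().
 */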

/*
 * The only thing keeping the address space around is the i_pages lock
 * (it's cycled in clear_inode() after removing the entries from i_pages).
 * After we call xas_unlock_irq(), we cannot touch xas->xa.
 */
static void wait_entry_unlocked(struct xa_state *xas, void *entry)
{
	struct wait_exceptional_entry_queue ewait;
	wait_queue_head_t *wq;

	init_wait(&ewait.wait);
	ewait.wait.func = wake_exceptional_entry_func;

	wq = dax_entry_waitqueue(xas, entry, &ewait.key);
	/*
	 * Unlike get_unlocked_entry() there is no guarantee that this
	 * path ever successfully retrieves an unlocked entry before an
	 * inode dies. Perform a non-exclusive wait in case this path
	 * never successfully performs its own wake up.
	 */
	prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE);
	xas_unlock_irq(xas);
	schedule();
	finish_wait(wq, &ewait.wait);
}

static void put_unlocked_entry(struct xa_state *xas, void *entry,
			       enum dax_wake_mode mode)
{
	if (entry && !dax_is_conflict(entry))
		dax_wake_entry(xas, entry, mode);
}

/*
 * We used the xa_state to get the entry, but then we locked the entry and
 * dropped the xa_lock, so we know the xa_state is stale and must be reset
 * before use.
 */
static void dax_unlock_entry(struct xa_state *xas, void *entry)
{
	void *old;

	BUG_ON(dax_is_locked(entry));
	xas_reset(xas);
	xas_lock_irq(xas);
	old = xas_store(xas, entry);
	xas_unlock_irq(xas);
	BUG_ON(!dax_is_locked(old));
	dax_wake_entry(xas, entry, WAKE_NEXT);
}

/*
 * Return: The entry stored at this location before it was locked.
 */
static void *dax_lock_entry(struct xa_state *xas, void *entry)
{
	unsigned long v = xa_to_value(entry);
	return xas_store(xas, xa_mk_value(v | DAX_LOCKED));
}

static unsigned long dax_entry_size(void *entry)
{
	if (dax_is_zero_entry(entry))
		return 0;
	else if (dax_is_empty_entry(entry))
		return 0;
	else if (dax_is_pmd_entry(entry))
		return PMD_SIZE;
	else
		return PAGE_SIZE;
}

static unsigned long dax_end_pfn(void *entry)
{
	return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
}

/*
 * Iterate through all mapped pfns represented by an entry, i.e. skip
 * 'empty' and 'zero' entries.
 */
#define for_each_mapped_pfn(entry, pfn) \
	for (pfn = dax_to_pfn(entry); \
			pfn < dax_end_pfn(entry); pfn++)
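
/*
 * Example: a PMD-sized entry makes the loop above visit dax_to_pfn(entry)
 * through dax_to_pfn(entry) + PG_PMD_NR - 1, while zero-page and empty
 * entries have dax_entry_size() == 0 so the loop body never runs.
 */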

/*
 * TODO: for reflink+dax we need a way to associate a single page with
 * multiple address_space instances at different linear_page_index()
 * offsets.
 */
static void dax_associate_entry(void *entry, struct address_space *mapping,
		struct vm_area_struct *vma, unsigned long address)
{
	unsigned long size = dax_entry_size(entry), pfn, index;
	int i = 0;

	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
		return;

	index = linear_page_index(vma, address & ~(size - 1));
	for_each_mapped_pfn(entry, pfn) {
		struct page *page = pfn_to_page(pfn);

		WARN_ON_ONCE(page->mapping);
		page->mapping = mapping;
		page->index = index + i++;
	}
}

static void dax_disassociate_entry(void *entry, struct address_space *mapping,
		bool trunc)
{
	unsigned long pfn;

	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
		return;

	for_each_mapped_pfn(entry, pfn) {
		struct page *page = pfn_to_page(pfn);

		WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
		WARN_ON_ONCE(page->mapping && page->mapping != mapping);
		page->mapping = NULL;
		page->index = 0;
	}
}

static struct page *dax_busy_page(void *entry)
{
	unsigned long pfn;

	for_each_mapped_pfn(entry, pfn) {
		struct page *page = pfn_to_page(pfn);

		if (page_ref_count(page) > 1)
			return page;
	}
	return NULL;
}

/*
 * dax_lock_page - Lock the DAX entry corresponding to a page
 * @page: The page whose entry we want to lock
 *
 * Context: Process context.
 * Return: A cookie to pass to dax_unlock_page() or 0 if the entry could
 * not be locked.
 */
dax_entry_t dax_lock_page(struct page *page)
{
	XA_STATE(xas, NULL, 0);
	void *entry;

	/* Ensure page->mapping isn't freed while we look at it */
	rcu_read_lock();
	for (;;) {
		struct address_space *mapping = READ_ONCE(page->mapping);

		entry = NULL;
		if (!mapping || !dax_mapping(mapping))
			break;

		/*
		 * In the device-dax case there's no need to lock, a
		 * struct dev_pagemap pin is sufficient to keep the
		 * inode alive, and we assume we have dev_pagemap pin
		 * otherwise we would not have a valid pfn_to_page()
		 * translation.
		 */
		entry = (void *)~0UL;
		if (S_ISCHR(mapping->host->i_mode))
			break;

		xas.xa = &mapping->i_pages;
		xas_lock_irq(&xas);
		if (mapping != page->mapping) {
			xas_unlock_irq(&xas);
			continue;
		}
		xas_set(&xas, page->index);
		entry = xas_load(&xas);
		if (dax_is_locked(entry)) {
			rcu_read_unlock();
			wait_entry_unlocked(&xas, entry);
			rcu_read_lock();
			continue;
		}
		dax_lock_entry(&xas, entry);
		xas_unlock_irq(&xas);
		break;
	}
	rcu_read_unlock();
	return (dax_entry_t)entry;
}

void dax_unlock_page(struct page *page, dax_entry_t cookie)
{
	struct address_space *mapping = page->mapping;
	XA_STATE(xas, &mapping->i_pages, page->index);

	if (S_ISCHR(mapping->host->i_mode))
		return;

	dax_unlock_entry(&xas, (void *)cookie);
}
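
/*
 * Usage sketch (the callers live outside this file):
 *
 *	dax_entry_t cookie = dax_lock_page(page);
 *	if (!cookie)
 *		return;		(the entry could not be locked)
 *	... operate on the page while its entry is held locked ...
 *	dax_unlock_page(page, cookie);
 */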

/*
 * Find page cache entry at given index. If it is a DAX entry, return it
 * with the entry locked. If the page cache doesn't contain an entry at
 * that index, add a locked empty entry.
 *
 * When requesting an entry with size DAX_PMD, grab_mapping_entry() will
 * either return that locked entry or will return VM_FAULT_FALLBACK.
 * This will happen if there are any PTE entries within the PMD range
 * that we are requesting.
 *
 * We always favor PTE entries over PMD entries. There isn't a flow where we
 * evict PTE entries in order to 'upgrade' them to a PMD entry.  A PMD
 * insertion will fail if it finds any PTE entries already in the tree, and a
 * PTE insertion will cause an existing PMD entry to be unmapped and
 * downgraded to PTE entries.  This happens for both PMD zero pages as
 * well as PMD empty entries.
 *
 * The exception to this downgrade path is for PMD entries that have
 * real storage backing them.  We will leave these real PMD entries in
 * the tree, and PTE writes will simply dirty the entire PMD entry.
 *
 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
 * persistent memory the benefit is doubtful. We can add that later if we can
 * show it helps.
 *
 * On error, this function does not return an ERR_PTR.  Instead it returns
 * a VM_FAULT code, encoded as an xarray internal entry.  The ERR_PTR values
 * overlap with xarray value entries.
 */
static void *grab_mapping_entry(struct xa_state *xas,
		struct address_space *mapping, unsigned int order)
{
	unsigned long index = xas->xa_index;
	bool pmd_downgrade;	/* splitting PMD entry into PTE entries? */
	void *entry;

retry:
	pmd_downgrade = false;
	xas_lock_irq(xas);
	entry = get_unlocked_entry(xas, order);

	if (entry) {
		if (dax_is_conflict(entry))
			goto fallback;
		if (!xa_is_value(entry)) {
			xas_set_err(xas, -EIO);
			goto out_unlock;
		}

		if (order == 0) {
			if (dax_is_pmd_entry(entry) &&
			    (dax_is_zero_entry(entry) ||
			     dax_is_empty_entry(entry))) {
				pmd_downgrade = true;
			}
		}
	}

	if (pmd_downgrade) {
		/*
		 * Make sure 'entry' remains valid while we drop
		 * the i_pages lock.
		 */
		dax_lock_entry(xas, entry);

		/*
		 * Besides huge zero pages the only other thing that gets
		 * downgraded are empty entries which don't need to be
		 * unmapped.
		 */
		if (dax_is_zero_entry(entry)) {
			xas_unlock_irq(xas);
			unmap_mapping_pages(mapping,
					xas->xa_index & ~PG_PMD_COLOUR,
					PG_PMD_NR, false);
			xas_reset(xas);
			xas_lock_irq(xas);
		}

		dax_disassociate_entry(entry, mapping, false);
		xas_store(xas, NULL);	/* undo the PMD join */
		dax_wake_entry(xas, entry, WAKE_ALL);
		mapping->nrexceptional--;
		entry = NULL;
		xas_set(xas, index);
	}

	if (entry) {
		dax_lock_entry(xas, entry);
	} else {
		unsigned long flags = DAX_EMPTY;

		if (order > 0)
			flags |= DAX_PMD;
		entry = dax_make_entry(pfn_to_pfn_t(0), flags);
		dax_lock_entry(xas, entry);
		if (xas_error(xas))
			goto out_unlock;
		mapping->nrexceptional++;
	}

out_unlock:
	xas_unlock_irq(xas);
	if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM))
		goto retry;
	if (xas->xa_node == XA_ERROR(-ENOMEM))
		return xa_mk_internal(VM_FAULT_OOM);
	if (xas_error(xas))
		return xa_mk_internal(VM_FAULT_SIGBUS);
	return entry;
fallback:
	xas_unlock_irq(xas);
	return xa_mk_internal(VM_FAULT_FALLBACK);
}
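
/*
 * Sketch of how a fault handler decodes the result (the fault paths are
 * further down in fs/dax.c, outside this excerpt):
 *
 *	entry = grab_mapping_entry(&xas, mapping, 0);
 *	if (xa_is_internal(entry))
 *		return xa_to_internal(entry);	(a VM_FAULT_* code)
 *	... use the locked entry, then dax_unlock_entry(&xas, entry) ...
 */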

/**
 * dax_layout_busy_page_range - find first pinned page in @mapping
 * @mapping: address space to scan for a page with ref count > 1
 * @start: Starting offset. Page containing 'start' is included.
 * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX,
 *       pages from 'start' till the end of file are included.
 *
 * DAX requires ZONE_DEVICE mapped pages. These pages are never
 * 'onlined' to the page allocator so they are considered idle when
 * page->count == 1. A filesystem uses this interface to determine if
 * any page in the mapping is busy, i.e. for DMA, or other
 * get_user_pages() usages.
 *
 * It is expected that the filesystem is holding locks to block the
 * establishment of new mappings in this address_space. I.e. it expects
 * to be able to run unmap_mapping_range() and subsequently not race
 * mapping_mapped() becoming true.
 */
struct page *dax_layout_busy_page_range(struct address_space *mapping,
					loff_t start, loff_t end)
{
	void *entry;
	unsigned int scanned = 0;
	struct page *page = NULL;
	pgoff_t start_idx = start >> PAGE_SHIFT;
	pgoff_t end_idx;
	XA_STATE(xas, &mapping->i_pages, start_idx);

	/*
	 * In the 'limited' case get_user_pages() for dax is disabled.
	 */
	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
		return NULL;

	if (!dax_mapping(mapping) || !mapping_mapped(mapping))
		return NULL;

	/* If end == LLONG_MAX, all pages from start till the end of the file */
	if (end == LLONG_MAX)
		end_idx = ULONG_MAX;
	else
		end_idx = end >> PAGE_SHIFT;
	/*
	 * If we race get_user_pages_fast() here either we'll see the
	 * elevated page count in the iteration and wait, or
	 * get_user_pages_fast() will see that the page it took a reference
	 * against is no longer mapped in the page tables and bail to the
	 * get_user_pages() slow path.  The slow path is protected by
	 * pte_lock() and pmd_lock(). New references are not taken without
	 * holding those locks, and unmap_mapping_pages() will not zero the
	 * pte or pmd without holding the respective lock, so we are
	 * guaranteed to either see new references or prevent new
	 * references from being established.
	 */
	unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0);

	xas_lock_irq(&xas);
	xas_for_each(&xas, entry, end_idx) {
		if (WARN_ON_ONCE(!xa_is_value(entry)))
			continue;
		if (unlikely(dax_is_locked(entry)))
			entry = get_unlocked_entry(&xas, 0);
		if (entry)
			page = dax_busy_page(entry);
		put_unlocked_entry(&xas, entry, WAKE_NEXT);
		if (page)
			break;
		if (++scanned % XA_CHECK_SCHED)
			continue;

		xas_pause(&xas);
		xas_unlock_irq(&xas);
		cond_resched();
		xas_lock_irq(&xas);
	}
	xas_unlock_irq(&xas);
	return page;
}
EXPORT_SYMBOL_GPL(dax_layout_busy_page_range);

struct page *dax_layout_busy_page(struct address_space *mapping)
{
	return dax_layout_busy_page_range(mapping, 0, LLONG_MAX);
}
EXPORT_SYMBOL_GPL(dax_layout_busy_page);
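
/*
 * Typical use (the filesystem callers live outside this file): before
 * truncating or hole-punching a DAX file, a filesystem calls
 * dax_layout_busy_page()/dax_layout_busy_page_range() under locks that block
 * new mappings and, if a busy page is returned, waits for its reference
 * count to drop back to 1 before freeing the backing blocks.
 */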

static int __dax_invalidate_entry(struct address_space *mapping,
					  pgoff_t index, bool trunc)
{
	XA_STATE(xas, &mapping->i_pages, index);
	int ret = 0;
	void *entry;

	xas_lock_irq(&xas);
	entry = get_unlocked_entry(&xas, 0);
	if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
		goto out;
	if (!trunc &&
	    (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) ||
	     xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE)))
		goto out;
	dax_disassociate_entry(entry, mapping, trunc);
	xas_store(&xas, NULL);
	mapping->nrexceptional--;
	ret = 1;
out:
	put_unlocked_entry(&xas, entry, WAKE_ALL);
	xas_unlock_irq(&xas);
	return ret;
}

/*
 * Delete DAX entry at @index from @mapping.  Wait for it
 * to be unlocked before deleting it.
 */
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	int ret = __dax_invalidate_entry(mapping, index, true);

	/*
	 * This gets called from truncate / punch_hole path. As such, the caller
	 * must hold locks protecting against concurrent modifications of the
	 * page cache (usually fs-private i_mmap_sem for writing). Since the
	 * caller has seen a DAX entry for this index, we better find it
	 * at that index as well...
	 */
	WARN_ON_ONCE(!ret);
	return ret;
}

/*
 * Invalidate DAX entry if it is clean.
 */
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
				      pgoff_t index)
{
	return __dax_invalidate_entry(mapping, index, false);
}

static int copy_cow_page_dax(struct block_device *bdev, struct dax_device *dax_dev,
			     sector_t sector, struct page *to, unsigned long vaddr)
{
	void *vto, *kaddr;
	pgoff_t pgoff;
	long rc;
	int id;

	rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff);
	if (rc)
		return rc;

	id = dax_read_lock();
	rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(PAGE_SIZE), &kaddr, NULL);
	if (rc < 0) {
		dax_read_unlock(id);
		return rc;
	}
	vto = kmap_atomic(to);
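	/*
	 * The #ifdef block below appears to be a tree-local fallback: if an
	 * ARM configuration does not provide copy_user_page(), fall back to
	 * a plain copy_page() for the COW copy that follows.
	 */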
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 731) #ifdef CONFIG_ARM
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 732) #ifndef copy_user_page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 733) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 734) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 735) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 736) copy_user_page(vto, (void __force *)kaddr, vaddr, to);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 737) kunmap_atomic(vto);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 738) dax_read_unlock(id);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 739) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 740) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 741)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 742) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 743) * By this point grab_mapping_entry() has ensured that we have a locked entry
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 744) * of the appropriate size so we don't have to worry about downgrading PMDs to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 745) * PTEs. If we happen to be trying to insert a PTE and there is a PMD
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 746) * already in the tree, we will skip the insertion and just dirty the PMD as
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 747) * appropriate.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 748) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 749) static void *dax_insert_entry(struct xa_state *xas,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 750) struct address_space *mapping, struct vm_fault *vmf,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 751) void *entry, pfn_t pfn, unsigned long flags, bool dirty)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 752) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 753) void *new_entry = dax_make_entry(pfn, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 754)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 755) if (dirty)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 756) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 757)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 758) if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 759) unsigned long index = xas->xa_index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 760) /* we are replacing a zero page with block mapping */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 761) if (dax_is_pmd_entry(entry))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 762) unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 763) PG_PMD_NR, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 764) else /* pte entry */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 765) unmap_mapping_pages(mapping, index, 1, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 766) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 767)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 768) xas_reset(xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 769) xas_lock_irq(xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 770) if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 771) void *old;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 772)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 773) dax_disassociate_entry(entry, mapping, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 774) dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 775) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 776) * Only swap our new entry into the page cache if the current
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 777) * entry is a zero page or an empty entry. If a normal PTE or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 778) * PMD entry is already in the cache, we leave it alone. This
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 779) * means that if we are trying to insert a PTE and the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 780) * existing entry is a PMD, we will just leave the PMD in the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 781) * tree and dirty it if necessary.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 782) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 783) old = dax_lock_entry(xas, new_entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 784) WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 785) DAX_LOCKED));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 786) entry = new_entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 787) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 788) xas_load(xas); /* Walk the xa_state */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 789) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 790)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 791) if (dirty)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 792) xas_set_mark(xas, PAGECACHE_TAG_DIRTY);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 793)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 794) xas_unlock_irq(xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 795) return entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 796) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 797)
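/*
 * Compute the user virtual address in @vma that maps file page offset
 * @pgoff; the VM_BUG_ON_VMA() below checks that the result really does
 * lie inside the VMA.
 */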
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 798) static inline
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 799) unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 800) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 801) unsigned long address;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 802)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 803) address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 804) VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 805) return address;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 806) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 807)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 808) /* Walk all mappings of a given index of a file and writeprotect them */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 809) static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 810) unsigned long pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 811) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 812) struct vm_area_struct *vma;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 813) pte_t pte, *ptep = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 814) pmd_t *pmdp = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 815) spinlock_t *ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 816)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 817) i_mmap_lock_read(mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 818) vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 819) struct mmu_notifier_range range;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 820) unsigned long address;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 821)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 822) cond_resched();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 823)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 824) if (!(vma->vm_flags & VM_SHARED))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 825) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 826)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 827) address = pgoff_address(index, vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 828)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 829) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 830) * follow_invalidate_pte() will use the range to call
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 831) * mmu_notifier_invalidate_range_start() on our behalf before
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 832) * taking any lock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 833) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 834) if (follow_invalidate_pte(vma->vm_mm, address, &range, &ptep,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 835) &pmdp, &ptl))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 836) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 837)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 838) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 839) * No need to call mmu_notifier_invalidate_range() as we are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 840) * downgrading page table protection, not changing it to point
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 841) * to a new page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 842) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 843) * See Documentation/vm/mmu_notifier.rst
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 844) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 845) if (pmdp) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 846) #ifdef CONFIG_FS_DAX_PMD
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 847) pmd_t pmd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 848)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 849) if (pfn != pmd_pfn(*pmdp))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 850) goto unlock_pmd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 851) if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 852) goto unlock_pmd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 853)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 854) flush_cache_page(vma, address, pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 855) pmd = pmdp_invalidate(vma, address, pmdp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 856) pmd = pmd_wrprotect(pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 857) pmd = pmd_mkclean(pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 858) set_pmd_at(vma->vm_mm, address, pmdp, pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 859) unlock_pmd:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 860) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 861) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 862) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 863) if (pfn != pte_pfn(*ptep))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 864) goto unlock_pte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 865) if (!pte_dirty(*ptep) && !pte_write(*ptep))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 866) goto unlock_pte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 867)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 868) flush_cache_page(vma, address, pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 869) pte = ptep_clear_flush(vma, address, ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 870) pte = pte_wrprotect(pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 871) pte = pte_mkclean(pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 872) set_pte_at(vma->vm_mm, address, ptep, pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 873) unlock_pte:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 874) pte_unmap_unlock(ptep, ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 875) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 876)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 877) mmu_notifier_invalidate_range_end(&range);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 878) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 879) i_mmap_unlock_read(mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 880) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 881)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 882) static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 883) struct address_space *mapping, void *entry)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 884) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 885) unsigned long pfn, index, count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 886) long ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 887)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 888) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 889) * A page got tagged dirty in DAX mapping? Something is seriously
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 890) * wrong.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 891) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 892) if (WARN_ON(!xa_is_value(entry)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 893) return -EIO;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 894)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 895) if (unlikely(dax_is_locked(entry))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 896) void *old_entry = entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 897)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 898) entry = get_unlocked_entry(xas, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 899)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 900) /* Entry got punched out / reallocated? */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 901) if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 902) goto put_unlocked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 903) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 904) * Entry got reallocated elsewhere? No need to write it back.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 905) * We have to compare pfns as we must not bail out due to a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 906) * difference in the lock bit or entry type.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 907) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 908) if (dax_to_pfn(old_entry) != dax_to_pfn(entry))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 909) goto put_unlocked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 910) if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 911) dax_is_zero_entry(entry))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 912) ret = -EIO;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 913) goto put_unlocked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 914) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 915)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 916) /* Another fsync thread may have already done this entry */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 917) if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 918) goto put_unlocked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 919) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 920)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 921) /* Lock the entry to serialize with page faults */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 922) dax_lock_entry(xas, entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 923)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 924) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 925) * We can clear the tag now, but we have to be careful that concurrent
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 926) * dax_writeback_one() calls for the same index cannot finish before we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 927) * actually flush the caches. This works because those calls look at
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 928) * the entry only under the i_pages lock, and once they do they will
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 929) * see the entry locked and wait for it to be unlocked.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 930) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 931) xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 932) xas_unlock_irq(xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 933)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 934) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 935) * If dax_writeback_mapping_range() was given a wbc->range_start
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 936) * in the middle of a PMD, the 'index' we use needs to be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 937) * aligned to the start of the PMD.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 938) * This allows us to flush for PMD_SIZE and not have to worry about
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 939) * partial PMD writebacks.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 940) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 941) pfn = dax_to_pfn(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 942) count = 1UL << dax_entry_order(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 943) index = xas->xa_index & ~(count - 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 944)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 945) dax_entry_mkclean(mapping, index, pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 946) dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 947) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 948) * After we have flushed the cache, we can clear the dirty tag. There
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 949) * cannot be new dirty data in the pfn after the flush has completed, as
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 950) * the pfn mappings are write-protected and the fault path waits for the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 951) * mapping entry lock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 952) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 953) xas_reset(xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 954) xas_lock_irq(xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 955) xas_store(xas, entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 956) xas_clear_mark(xas, PAGECACHE_TAG_DIRTY);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 957) dax_wake_entry(xas, entry, WAKE_NEXT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 958)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 959) trace_dax_writeback_one(mapping->host, index, count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 960) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 961)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 962) put_unlocked:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 963) put_unlocked_entry(xas, entry, WAKE_NEXT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 964) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 965) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 966)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 967) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 968) * Flush the mapping to the persistent domain within the byte range of [start,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 969) * end]. This is required by data integrity operations to ensure file data is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 970) * on persistent storage prior to completion of the operation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 971) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 972) int dax_writeback_mapping_range(struct address_space *mapping,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 973) struct dax_device *dax_dev, struct writeback_control *wbc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 974) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 975) XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 976) struct inode *inode = mapping->host;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 977) pgoff_t end_index = wbc->range_end >> PAGE_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 978) void *entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 979) int ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 980) unsigned int scanned = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 981)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 982) if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 983) return -EIO;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 984)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 985) if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 986) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 987)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 988) trace_dax_writeback_range(inode, xas.xa_index, end_index);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 989)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 990) tag_pages_for_writeback(mapping, xas.xa_index, end_index);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 991)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 992) xas_lock_irq(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 993) xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 994) ret = dax_writeback_one(&xas, dax_dev, mapping, entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 995) if (ret < 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 996) mapping_set_error(mapping, ret);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 997) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 998) }
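/*
 * Every XA_CHECK_SCHED entries, briefly drop the xarray lock and
 * reschedule so we do not hog the CPU with interrupts disabled.
 */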
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 999) if (++scanned % XA_CHECK_SCHED)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1000) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1001)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1002) xas_pause(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1003) xas_unlock_irq(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1004) cond_resched();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1005) xas_lock_irq(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1006) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1007) xas_unlock_irq(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1008) trace_dax_writeback_range_done(inode, xas.xa_index, end_index);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1009) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1010) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1011) EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1012)
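/*
 * Convert a byte position in the file to a 512-byte sector number on
 * the block device backing this iomap extent.
 */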
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1013) static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1014) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1015) return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1016) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1017)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1018) static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1019) pfn_t *pfnp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1020) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1021) const sector_t sector = dax_iomap_sector(iomap, pos);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1022) pgoff_t pgoff;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1023) int id, rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1024) long length;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1025)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1026) rc = bdev_dax_pgoff(iomap->bdev, sector, size, &pgoff);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1027) if (rc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1028) return rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1029) id = dax_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1030) length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1031) NULL, pfnp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1032) if (length < 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1033) rc = length;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1034) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1035) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1036) rc = -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1037) if (PFN_PHYS(length) < size)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1038) goto out;
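/* The returned pfn must be naturally aligned to the mapping size. */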
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1039) if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1040) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1041) /* For larger pages we need devmap */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1042) if (length > 1 && !pfn_t_devmap(*pfnp))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1043) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1044) rc = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1045) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1046) dax_read_unlock(id);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1047) return rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1048) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1049)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1050) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1051) * The user has performed a load from a hole in the file. Allocating a new
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1052) * page in the file would cause excessive storage usage for workloads with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1053) * sparse files. Instead we insert a read-only mapping of the 4k zero page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1054) * If this page is ever written to we will re-fault and change the mapping to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1055) * point to real DAX storage instead.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1056) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1057) static vm_fault_t dax_load_hole(struct xa_state *xas,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1058) struct address_space *mapping, void **entry,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1059) struct vm_fault *vmf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1060) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1061) struct inode *inode = mapping->host;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1062) unsigned long vaddr = vmf->address;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1063) pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1064) vm_fault_t ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1065)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1066) *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1067) DAX_ZERO_PAGE, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1068)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1069) ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1070) trace_dax_load_hole(inode, vmf, ret);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1071) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1072) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1073)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1074) s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1075) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1076) sector_t sector = iomap_sector(iomap, pos & PAGE_MASK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1077) pgoff_t pgoff;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1078) long rc, id;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1079) void *kaddr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1080) bool page_aligned = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1081) unsigned offset = offset_in_page(pos);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1082) unsigned size = min_t(u64, PAGE_SIZE - offset, length);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1083)
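/*
 * Use the dax device's zero_page_range() op only when zeroing a whole,
 * page-aligned page; otherwise map the page and zero the sub-page range
 * by hand below.
 */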
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1084) if (IS_ALIGNED(sector << SECTOR_SHIFT, PAGE_SIZE) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1085) (size == PAGE_SIZE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1086) page_aligned = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1087)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1088) rc = bdev_dax_pgoff(iomap->bdev, sector, PAGE_SIZE, &pgoff);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1089) if (rc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1090) return rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1091)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1092) id = dax_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1093)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1094) if (page_aligned)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1095) rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1096) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1097) rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1098) if (rc < 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1099) dax_read_unlock(id);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1100) return rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1101) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1102)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1103) if (!page_aligned) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1104) memset(kaddr + offset, 0, size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1105) dax_flush(iomap->dax_dev, kaddr + offset, size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1106) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1107) dax_read_unlock(id);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1108) return size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1109) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1110)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1111) static loff_t
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1112) dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1113) struct iomap *iomap, struct iomap *srcmap)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1114) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1115) struct block_device *bdev = iomap->bdev;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1116) struct dax_device *dax_dev = iomap->dax_dev;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1117) struct iov_iter *iter = data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1118) loff_t end = pos + length, done = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1119) ssize_t ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1120) size_t xfer;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1121) int id;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1122)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1123) if (iov_iter_rw(iter) == READ) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1124) end = min(end, i_size_read(inode));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1125) if (pos >= end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1126) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1127)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1128) if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1129) return iov_iter_zero(min(length, end - pos), iter);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1130) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1131)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1132) if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1133) return -EIO;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1134)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1135) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1136) * A write can allocate a block for an area which has a hole page mapped
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1137) * into the page tables. We have to tear down these mappings so that data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1138) * written by write(2) is visible via mmap.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1139) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1140) if (iomap->flags & IOMAP_F_NEW) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1141) invalidate_inode_pages2_range(inode->i_mapping,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1142) pos >> PAGE_SHIFT,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1143) (end - 1) >> PAGE_SHIFT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1144) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1145)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1146) id = dax_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1147) while (pos < end) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1148) unsigned offset = pos & (PAGE_SIZE - 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1149) const size_t size = ALIGN(length + offset, PAGE_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1150) const sector_t sector = dax_iomap_sector(iomap, pos);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1151) ssize_t map_len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1152) pgoff_t pgoff;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1153) void *kaddr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1154)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1155) if (fatal_signal_pending(current)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1156) ret = -EINTR;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1157) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1158) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1159)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1160) ret = bdev_dax_pgoff(bdev, sector, size, &pgoff);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1161) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1162) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1163)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1164) map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1165) &kaddr, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1166) if (map_len < 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1167) ret = map_len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1168) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1169) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1170)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1171) map_len = PFN_PHYS(map_len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1172) kaddr += offset;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1173) map_len -= offset;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1174) if (map_len > end - pos)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1175) map_len = end - pos;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1176)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1177) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1178) * The userspace address for the memory copy has already been
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1179) * validated via access_ok() in either vfs_read() or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1180) * vfs_write(), depending on which operation we are doing.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1181) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1182) if (iov_iter_rw(iter) == WRITE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1183) xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1184) map_len, iter);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1185) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1186) xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1187) map_len, iter);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1188)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1189) pos += xfer;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1190) length -= xfer;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1191) done += xfer;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1192)
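/*
 * A short copy means the dax copy routine could not transfer the whole
 * chunk (e.g. it faulted on the user buffer): flag -EFAULT if nothing
 * was copied at all and stop after any partial transfer.
 */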
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1193) if (xfer == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1194) ret = -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1195) if (xfer < map_len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1196) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1197) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1198) dax_read_unlock(id);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1199)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1200) return done ? done : ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1201) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1202)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1203) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1204) * dax_iomap_rw - Perform I/O to a DAX file
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1205) * @iocb: The control block for this I/O
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1206) * @iter: The addresses to do I/O from or to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1207) * @ops: iomap ops passed from the file system
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1208) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1209) * This function performs read and write operations to directly mapped
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1210) * persistent memory. The caller needs to take care of read/write exclusion
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1211) * and evicting any page cache pages in the region under I/O.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1212) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1213) ssize_t
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1214) dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1215) const struct iomap_ops *ops)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1216) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1217) struct address_space *mapping = iocb->ki_filp->f_mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1218) struct inode *inode = mapping->host;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1219) loff_t pos = iocb->ki_pos, ret = 0, done = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1220) unsigned flags = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1221)
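/* Writes must hold i_rwsem exclusively; reads may hold it shared. */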
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1222) if (iov_iter_rw(iter) == WRITE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1223) lockdep_assert_held_write(&inode->i_rwsem);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1224) flags |= IOMAP_WRITE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1225) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1226) lockdep_assert_held(&inode->i_rwsem);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1227) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1228)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1229) if (iocb->ki_flags & IOCB_NOWAIT)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1230) flags |= IOMAP_NOWAIT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1231)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1232) while (iov_iter_count(iter)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1233) ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1234) iter, dax_iomap_actor);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1235) if (ret <= 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1236) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1237) pos += ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1238) done += ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1239) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1240)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1241) iocb->ki_pos += done;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1242) return done ? done : ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1243) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1244) EXPORT_SYMBOL_GPL(dax_iomap_rw);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1245)
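/*
 * Translate the error from the fault path into a vm_fault_t: success
 * becomes VM_FAULT_NOPAGE, anything else goes through vmf_error().
 */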
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1246) static vm_fault_t dax_fault_return(int error)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1247) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1248) if (error == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1249) return VM_FAULT_NOPAGE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1250) return vmf_error(error);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1251) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1252)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1253) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1254) * MAP_SYNC on a dax mapping guarantees dirty metadata is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1255) * flushed on write-faults (non-cow), but not read-faults.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1256) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1257) static bool dax_fault_is_synchronous(unsigned long flags,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1258) struct vm_area_struct *vma, struct iomap *iomap)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1259) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1260) return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1261) && (iomap->flags & IOMAP_F_DIRTY);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1262) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1263)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1264) static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1265) int *iomap_errp, const struct iomap_ops *ops)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1266) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1267) struct vm_area_struct *vma = vmf->vma;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1268) struct address_space *mapping = vma->vm_file->f_mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1269) XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1270) struct inode *inode = mapping->host;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1271) unsigned long vaddr = vmf->address;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1272) loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1273) struct iomap iomap = { .type = IOMAP_HOLE };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1274) struct iomap srcmap = { .type = IOMAP_HOLE };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1275) unsigned flags = IOMAP_FAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1276) int error, major = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1277) bool write = vmf->flags & FAULT_FLAG_WRITE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1278) bool sync;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1279) vm_fault_t ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1280) void *entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1281) pfn_t pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1282)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1283) trace_dax_pte_fault(inode, vmf, ret);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1284) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1285) * Check whether offset isn't beyond end of file now. Caller is supposed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1286) * to hold locks serializing us with truncate / punch hole so this is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1287) * a reliable test.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1288) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1289) if (pos >= i_size_read(inode)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1290) ret = VM_FAULT_SIGBUS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1291) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1292) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1293)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1294) if (write && !vmf->cow_page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1295) flags |= IOMAP_WRITE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1296)
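/*
 * grab_mapping_entry() returns a locked entry for this index, or an
 * internal xarray value encoding the vm_fault_t to return.
 */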
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1297) entry = grab_mapping_entry(&xas, mapping, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1298) if (xa_is_internal(entry)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1299) ret = xa_to_internal(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1300) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1301) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1302)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1303) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1304) * It is possible, particularly with mixed reads & writes to private
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1305) * mappings, that we have raced with a PMD fault that overlaps with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1306) * the PTE we need to set up. If so just return and the fault will be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1307) * retried.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1308) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1309) if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1310) ret = VM_FAULT_NOPAGE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1311) goto unlock_entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1312) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1313)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1314) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1315) * Note that we don't bother to use iomap_apply here: DAX requires
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1316) * the file system block size to be equal to the page size, which means
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1317) * that we never have to deal with more than a single extent here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1318) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1319) error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap, &srcmap);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1320) if (iomap_errp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1321) *iomap_errp = error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1322) if (error) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1323) ret = dax_fault_return(error);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1324) goto unlock_entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1325) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1326) if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1327) error = -EIO; /* fs corruption? */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1328) goto error_finish_iomap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1329) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1330)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1331) if (vmf->cow_page) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1332) sector_t sector = dax_iomap_sector(&iomap, pos);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1333)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1334) switch (iomap.type) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1335) case IOMAP_HOLE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1336) case IOMAP_UNWRITTEN:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1337) clear_user_highpage(vmf->cow_page, vaddr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1338) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1339) case IOMAP_MAPPED:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1340) error = copy_cow_page_dax(iomap.bdev, iomap.dax_dev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1341) sector, vmf->cow_page, vaddr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1342) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1343) default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1344) WARN_ON_ONCE(1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1345) error = -EIO;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1346) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1347) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1348)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1349) if (error)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1350) goto error_finish_iomap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1351)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1352) __SetPageUptodate(vmf->cow_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1353) ret = finish_fault(vmf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1354) if (!ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1355) ret = VM_FAULT_DONE_COW;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1356) goto finish_iomap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1357) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1358)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1359) sync = dax_fault_is_synchronous(flags, vma, &iomap);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1360)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1361) switch (iomap.type) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1362) case IOMAP_MAPPED:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1363) if (iomap.flags & IOMAP_F_NEW) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1364) count_vm_event(PGMAJFAULT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1365) count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1366) major = VM_FAULT_MAJOR;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1367) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1368) error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1369) if (error < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1370) goto error_finish_iomap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1371)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1372) entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1373) 0, write && !sync);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1374)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1375) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1376) * If we are doing a synchronous page fault and the inode needs fsync,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1377) * we can insert the PTE into the page tables only after that happens.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1378) * Skip the insertion for now and return the pfn so that the caller can
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1379) * insert it after fsync is done.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1380) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1381) if (sync) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1382) if (WARN_ON_ONCE(!pfnp)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1383) error = -EIO;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1384) goto error_finish_iomap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1385) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1386) *pfnp = pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1387) ret = VM_FAULT_NEEDDSYNC | major;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1388) goto finish_iomap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1389) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1390) trace_dax_insert_mapping(inode, vmf, entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1391) if (write)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1392) ret = vmf_insert_mixed_mkwrite(vma, vaddr, pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1393) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1394) ret = vmf_insert_mixed(vma, vaddr, pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1395)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1396) goto finish_iomap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1397) case IOMAP_UNWRITTEN:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1398) case IOMAP_HOLE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1399) if (!write) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1400) ret = dax_load_hole(&xas, mapping, &entry, vmf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1401) goto finish_iomap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1402) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1403) fallthrough;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1404) default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1405) WARN_ON_ONCE(1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1406) error = -EIO;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1407) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1408) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1409)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1410) error_finish_iomap:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1411) ret = dax_fault_return(error);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1412) finish_iomap:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1413) if (ops->iomap_end) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1414) int copied = PAGE_SIZE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1415)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1416) if (ret & VM_FAULT_ERROR)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1417) copied = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1418) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1419) * The fault is done by now and there's no way back (another
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1420) * thread may already be happily using the PTE we have installed).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1421) * Just ignore error from ->iomap_end since we cannot do much
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1422) * with it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1423) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1424) ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1425) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1426) unlock_entry:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1427) dax_unlock_entry(&xas, entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1428) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1429) trace_dax_pte_fault_done(inode, vmf, ret);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1430) return ret | major;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1431) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1432)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1433) #ifdef CONFIG_FS_DAX_PMD
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1434) static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1435) struct iomap *iomap, void **entry)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1436) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1437) struct address_space *mapping = vmf->vma->vm_file->f_mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1438) unsigned long pmd_addr = vmf->address & PMD_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1439) struct vm_area_struct *vma = vmf->vma;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1440) struct inode *inode = mapping->host;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1441) pgtable_t pgtable = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1442) struct page *zero_page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1443) spinlock_t *ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1444) pmd_t pmd_entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1445) pfn_t pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1446)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1447) zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1448)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1449) if (unlikely(!zero_page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1450) goto fallback;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1451)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1452) pfn = page_to_pfn_t(zero_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1453) *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1454) DAX_PMD | DAX_ZERO_PAGE, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1455)
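/*
 * Some architectures require a page table to be deposited when
 * installing a huge PMD; allocate one up front if so.
 */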
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1456) if (arch_needs_pgtable_deposit()) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1457) pgtable = pte_alloc_one(vma->vm_mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1458) if (!pgtable)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1459) return VM_FAULT_OOM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1460) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1461)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1462) ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1463) if (!pmd_none(*(vmf->pmd))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1464) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1465) goto fallback;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1466) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1467)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1468) if (pgtable) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1469) pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1470) mm_inc_nr_ptes(vma->vm_mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1471) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1472) pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1473) pmd_entry = pmd_mkhuge(pmd_entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1474) set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1475) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1476) trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1477) return VM_FAULT_NOPAGE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1478)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1479) fallback:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1480) if (pgtable)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1481) pte_free(vma->vm_mm, pgtable);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1482) trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1483) return VM_FAULT_FALLBACK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1484) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1485)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1486) static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1487) const struct iomap_ops *ops)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1488) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1489) struct vm_area_struct *vma = vmf->vma;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1490) struct address_space *mapping = vma->vm_file->f_mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1491) XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1492) unsigned long pmd_addr = vmf->address & PMD_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1493) bool write = vmf->flags & FAULT_FLAG_WRITE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1494) bool sync;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1495) unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1496) struct inode *inode = mapping->host;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1497) vm_fault_t result = VM_FAULT_FALLBACK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1498) struct iomap iomap = { .type = IOMAP_HOLE };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1499) struct iomap srcmap = { .type = IOMAP_HOLE };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1500) pgoff_t max_pgoff;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1501) void *entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1502) loff_t pos;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1503) int error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1504) pfn_t pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1505)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1506) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1507) * Check whether offset isn't beyond end of file now. Caller is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1508) * supposed to hold locks serializing us with truncate / punch hole so
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1509) * this is a reliable test.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1510) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1511) max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1512)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1513) trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1514)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1515) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1516) * Make sure that the faulting address's PMD offset (color) matches
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1517) * the PMD offset from the start of the file. This is necessary so
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1518) * that a PMD range in the page table overlaps exactly with a PMD
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1519) * range in the page cache.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1520) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1521) if ((vmf->pgoff & PG_PMD_COLOUR) !=
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1522) ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1523) goto fallback;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1524)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1525) /* Fall back to PTEs if we're going to COW */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1526) if (write && !(vma->vm_flags & VM_SHARED))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1527) goto fallback;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1528)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1529) /* If the PMD would extend outside the VMA */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1530) if (pmd_addr < vma->vm_start)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1531) goto fallback;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1532) if ((pmd_addr + PMD_SIZE) > vma->vm_end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1533) goto fallback;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1534)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1535) if (xas.xa_index >= max_pgoff) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1536) result = VM_FAULT_SIGBUS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1537) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1538) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1539)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1540) /* If the PMD would extend beyond the file size */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1541) if ((xas.xa_index | PG_PMD_COLOUR) >= max_pgoff)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1542) goto fallback;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1543)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1544) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1545) * grab_mapping_entry() will make sure we get an empty PMD entry,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1546) * a zero PMD entry or a DAX PMD. If it can't (because a PTE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1547) * entry is already in the array, for instance), it will return
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1548) * VM_FAULT_FALLBACK.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1549) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1550) entry = grab_mapping_entry(&xas, mapping, PMD_ORDER);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1551) if (xa_is_internal(entry)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1552) result = xa_to_internal(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1553) goto fallback;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1554) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1555)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1556) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1557) * It is possible, particularly with mixed reads & writes to private
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1558) * mappings, that we have raced with a PTE fault that overlaps with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1559) * the PMD we need to set up. If so just return and the fault will be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1560) * retried.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1561) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1562) if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1563) !pmd_devmap(*vmf->pmd)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1564) result = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1565) goto unlock_entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1566) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1567)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1568) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1569) * Note that we don't use iomap_apply here. We aren't doing I/O, only
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1570) * setting up a mapping, so really we're using iomap_begin() as a way
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1571) * to look up our filesystem block.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1572) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1573) pos = (loff_t)xas.xa_index << PAGE_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1574) error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1575) &srcmap);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1576) if (error)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1577) goto unlock_entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1578)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1579) if (iomap.offset + iomap.length < pos + PMD_SIZE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1580) goto finish_iomap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1581)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1582) sync = dax_fault_is_synchronous(iomap_flags, vma, &iomap);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1583)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1584) switch (iomap.type) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1585) case IOMAP_MAPPED:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1586) error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1587) if (error < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1588) goto finish_iomap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1589)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1590) entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1591) DAX_PMD, write && !sync);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1592)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1593) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1594) * If we are doing a synchronous page fault and the inode needs fsync,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1595) * we can insert the PMD into the page tables only after that fsync has
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1596) * happened.  Skip the insertion for now and return the pfn so that the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1597) * caller can insert it after the fsync is done.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1598) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1599) if (sync) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1600) if (WARN_ON_ONCE(!pfnp))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1601) goto finish_iomap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1602) *pfnp = pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1603) result = VM_FAULT_NEEDDSYNC;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1604) goto finish_iomap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1605) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1606)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1607) trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1608) result = vmf_insert_pfn_pmd(vmf, pfn, write);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1609) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1610) case IOMAP_UNWRITTEN:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1611) case IOMAP_HOLE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1612) if (WARN_ON_ONCE(write))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1613) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1614) result = dax_pmd_load_hole(&xas, vmf, &iomap, &entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1615) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1616) default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1617) WARN_ON_ONCE(1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1618) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1619) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1620)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1621) finish_iomap:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1622) if (ops->iomap_end) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1623) int copied = PMD_SIZE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1624)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1625) if (result == VM_FAULT_FALLBACK)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1626) copied = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1627) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1628) * The fault is done by now and there's no way back (another thread
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1629) * may already be happily using the PMD we have installed).  Just
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1630) * ignore any error from ->iomap_end since we cannot do much
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1631) * with it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1632) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1633) ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1634) &iomap);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1635) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1636) unlock_entry:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1637) dax_unlock_entry(&xas, entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1638) fallback:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1639) if (result == VM_FAULT_FALLBACK) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1640) split_huge_pmd(vma, vmf->pmd, vmf->address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1641) count_vm_event(THP_FAULT_FALLBACK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1642) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1643) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1644) trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1645) return result;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1646) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1647) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1648) static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1649) const struct iomap_ops *ops)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1650) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1651) return VM_FAULT_FALLBACK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1652) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1653) #endif /* CONFIG_FS_DAX_PMD */
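
/*
 * Illustrative sketch only (not part of this file; all "example_*" names are
 * hypothetical): the fault paths above use ->iomap_begin() purely as a block
 * lookup and never issue I/O through it.  A minimal DAX-capable
 * implementation might look roughly like this, assuming a helper that
 * resolves a file offset to a contiguous extent on the DAX device:
 *
 *	static int example_iomap_begin(struct inode *inode, loff_t pos,
 *			loff_t length, unsigned int flags, struct iomap *iomap,
 *			struct iomap *srcmap)
 *	{
 *		u64 disk_addr;	// byte address of the extent backing 'pos'
 *		u64 extent_len;	// length of that contiguous extent
 *
 *		if (example_lookup_extent(inode, pos, &disk_addr, &extent_len))
 *			return -EIO;
 *
 *		iomap->addr = disk_addr;
 *		iomap->offset = pos;
 *		iomap->length = extent_len;
 *		iomap->type = IOMAP_MAPPED;
 *		iomap->bdev = inode->i_sb->s_bdev;
 *		iomap->dax_dev = example_get_dax_dev(inode->i_sb);
 *		return 0;
 *	}
 *
 * Returning an extent shorter than PMD_SIZE is fine: dax_iomap_pmd_fault()
 * above simply falls back to PTEs when the mapping does not cover the whole
 * huge page.  ->iomap_end() is optional; when present, the PMD path calls it
 * with 'copied' set to 0 on fallback and ignores its return value.
 */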
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1654)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1655) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1656) * dax_iomap_fault - handle a page fault on a DAX file
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1657) * @vmf: The description of the fault
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1658) * @pe_size: Size of the page to fault in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1659) * @pfnp: Where to return the PFN for a synchronous fault that still needs fsync
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1660) * @iomap_errp: Storage for detailed error code in case of error
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1661) * @ops: Iomap ops passed from the file system
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1662) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1663) * When a page fault occurs, filesystems may call this helper in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1664) * their fault handler for DAX files. dax_iomap_fault() assumes the caller
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1665) * has done all the necessary locking for the page fault to proceed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1666) * successfully (an illustrative caller is sketched below the function).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1667) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1668) vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1669) pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1670) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1671) switch (pe_size) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1672) case PE_SIZE_PTE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1673) return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1674) case PE_SIZE_PMD:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1675) return dax_iomap_pmd_fault(vmf, pfnp, ops);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1676) default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1677) return VM_FAULT_FALLBACK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1678) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1679) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1680) EXPORT_SYMBOL_GPL(dax_iomap_fault);
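
/*
 * Illustrative sketch only (hypothetical "example_*" names): a filesystem
 * typically wires dax_iomap_fault() into its vm_operations_struct roughly
 * like this (write/freeze protection and synchronous faults are omitted;
 * see the sketch after dax_finish_sync_fault() for the MAP_SYNC case, and
 * ext4/xfs for real callers):
 *
 *	static vm_fault_t example_dax_fault(struct vm_fault *vmf)
 *	{
 *		return dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, NULL,
 *				&example_iomap_ops);
 *	}
 *
 *	static const struct vm_operations_struct example_dax_vm_ops = {
 *		.fault		= example_dax_fault,
 *		.page_mkwrite	= example_dax_fault,
 *	};
 */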
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1681)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1682) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1683) * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1684) * @vmf: The description of the fault
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1685) * @pfn: PFN to insert
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1686) * @order: Order of entry to insert.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1687) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1688) * This function inserts a writeable PTE or PMD entry into the page tables
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1689) * for an mmapped DAX file.  It also marks the page cache entry as dirty.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1690) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1691) static vm_fault_t
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1692) dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1693) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1694) struct address_space *mapping = vmf->vma->vm_file->f_mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1695) XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1696) void *entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1697) vm_fault_t ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1698)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1699) xas_lock_irq(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1700) entry = get_unlocked_entry(&xas, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1701) /* Did we race with someone splitting the entry or similar? */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1702) if (!entry || dax_is_conflict(entry) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1703) (order == 0 && !dax_is_pte_entry(entry))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1704) put_unlocked_entry(&xas, entry, WAKE_NEXT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1705) xas_unlock_irq(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1706) trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1707) VM_FAULT_NOPAGE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1708) return VM_FAULT_NOPAGE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1709) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1710) xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1711) dax_lock_entry(&xas, entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1712) xas_unlock_irq(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1713) if (order == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1714) ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1715) #ifdef CONFIG_FS_DAX_PMD
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1716) else if (order == PMD_ORDER)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1717) ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1718) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1719) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1720) ret = VM_FAULT_FALLBACK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1721) dax_unlock_entry(&xas, entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1722) trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1723) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1724) }
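
/*
 * For orientation (assuming the common x86-64 configuration of 4KiB pages and
 * 2MiB PMDs): pe_order(PE_SIZE_PMD) == PMD_SHIFT - PAGE_SHIFT == 21 - 12 == 9,
 * so the PMD case above covers 1 << 9 == 512 pages, and the fsync range in
 * dax_finish_sync_fault() below spans PAGE_SIZE << 9 == 2MiB starting at the
 * faulting page offset.
 */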
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1725)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1726) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1727) * dax_finish_sync_fault - finish synchronous page fault
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1728) * @vmf: The description of the fault
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1729) * @pe_size: Size of entry to be inserted
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1730) * @pfn: PFN to insert
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1731) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1732) * This function ensures that the file range touched by the page fault is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1733) * stored persistently on the media and then inserts the appropriate page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1734) * table entry (an illustrative caller is sketched below the function).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1735) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1736) vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1737) enum page_entry_size pe_size, pfn_t pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1738) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1739) int err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1740) loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1741) unsigned int order = pe_order(pe_size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1742) size_t len = PAGE_SIZE << order;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1743)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1744) err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1745) if (err)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1746) return VM_FAULT_SIGBUS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1747) return dax_insert_pfn_mkwrite(vmf, pfn, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1748) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1749) EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
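
/*
 * Illustrative sketch only (hypothetical "example_*" names): a filesystem
 * that supports MAP_SYNC combines dax_iomap_fault() and
 * dax_finish_sync_fault() roughly like this in its fault handler:
 *
 *	static vm_fault_t example_dax_huge_fault(struct vm_fault *vmf,
 *			enum page_entry_size pe_size)
 *	{
 *		bool write = vmf->flags & FAULT_FLAG_WRITE;
 *		struct inode *inode = file_inode(vmf->vma->vm_file);
 *		vm_fault_t ret;
 *		pfn_t pfn;
 *
 *		if (write) {
 *			sb_start_pagefault(inode->i_sb);
 *			file_update_time(vmf->vma->vm_file);
 *		}
 *
 *		// For a synchronous fault that still needs fsync,
 *		// dax_iomap_fault() fills 'pfn' and returns
 *		// VM_FAULT_NEEDDSYNC instead of mapping the page.
 *		ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL,
 *				&example_iomap_ops);
 *		if (ret & VM_FAULT_NEEDDSYNC)
 *			ret = dax_finish_sync_fault(vmf, pe_size, pfn);
 *
 *		if (write)
 *			sb_end_pagefault(inode->i_sb);
 *		return ret;
 *	}
 *
 * This mirrors the ext4 and xfs callers; the DAX-specific parts are the two
 * calls above and passing &pfn so that the synchronous case can be finished
 * after the fsync.
 */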