// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/direct-io.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * O_DIRECT
 *
 * 04Jul2002	Andrew Morton
 *		Initial version
 * 11Sep2002	janetinc@us.ibm.com
 *		added readv/writev support.
 * 29Oct2002	Andrew Morton
 *		rewrote bio_add_page() support.
 * 30Oct2002	pbadari@us.ibm.com
 *		added support for non-aligned IO.
 * 06Nov2002	pbadari@us.ibm.com
 *		added asynchronous IO support.
 * 21Jul2003	nathans@sgi.com
 *		added IO completion notifier.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fs.h>
#include <linux/fscrypt.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/bio.h>
#include <linux/wait.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/rwsem.h>
#include <linux/uio.h>
#include <linux/atomic.h>
#include <linux/prefetch.h>

#include "internal.h"

/*
 * How many user pages to map in one call to get_user_pages().  This determines
 * the size of a structure in the slab cache.
 */
#define DIO_PAGES	64

/*
 * Flags for dio_complete()
 */
#define DIO_COMPLETE_ASYNC		0x01	/* This is async IO */
#define DIO_COMPLETE_INVALIDATE	0x02	/* Can invalidate pages */

/*
 * This code generally works in units of "dio_blocks".  A dio_block is
 * somewhere between the hard sector size and the filesystem block size.  It
 * is determined on a per-invocation basis.  When talking to the filesystem
 * we need to convert dio_blocks to fs_blocks by scaling the dio_block quantity
 * down by dio->blkfactor.  Similarly, fs-blocksize quantities are converted
 * to dio_block quantities by shifting left by blkfactor.
 *
 * If blkfactor is zero then the user's request was aligned to the filesystem's
 * blocksize.
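 *
 * For example, a request aligned to 512 bytes against a filesystem that uses
 * 4096-byte blocks gives blkbits = 9 and blkfactor = 3: dio_block N maps to
 * fs_block (N >> 3), and one fs_block spans (1 << 3) = 8 dio_blocks.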
 */

/* dio_state only used in the submission path */

struct dio_submit {
	struct bio *bio;		/* bio under assembly */
	unsigned blkbits;		/* doesn't change */
	unsigned blkfactor;		/* When we're using an alignment which
					   is finer than the filesystem's soft
					   blocksize, this specifies how much
					   finer.  blkfactor=2 means 1/4-block
					   alignment.  Does not change */
	unsigned start_zero_done;	/* flag: sub-blocksize zeroing has
					   been performed at the start of a
					   write */
	int pages_in_io;		/* approximate total IO pages */
	sector_t block_in_file;		/* Current offset into the underlying
					   file in dio_block units. */
	unsigned blocks_available;	/* At block_in_file.  changes */
	int reap_counter;		/* rate limit reaping */
	sector_t final_block_in_request;/* doesn't change */
	int boundary;			/* prev block is at a boundary */
	get_block_t *get_block;		/* block mapping function */
	dio_submit_t *submit_io;	/* IO submission function */

	loff_t logical_offset_in_bio;	/* current first logical block in bio */
	sector_t final_block_in_bio;	/* current final block in bio + 1 */
	sector_t next_block_for_io;	/* next block to be put under IO,
					   in dio_blocks units */

	/*
	 * Deferred addition of a page to the dio.  These variables are
	 * private to dio_send_cur_page(), submit_page_section() and
	 * dio_bio_add_page().
	 */
	struct page *cur_page;		/* The page */
	unsigned cur_page_offset;	/* Offset into it, in bytes */
	unsigned cur_page_len;		/* Nr of bytes at cur_page_offset */
	sector_t cur_page_block;	/* Where it starts */
	loff_t cur_page_fs_offset;	/* Offset in file */

	struct iov_iter *iter;
	/*
	 * Page queue.  These variables belong to dio_refill_pages() and
	 * dio_get_page().
	 */
	unsigned head;			/* next page to process */
	unsigned tail;			/* last valid page + 1 */
	size_t from, to;
};

/* dio_state communicated between submission path and end_io */
struct dio {
	int flags;			/* doesn't change */
	int op;
	int op_flags;
	blk_qc_t bio_cookie;
	struct gendisk *bio_disk;
	struct inode *inode;
	loff_t i_size;			/* i_size when submitted */
	dio_iodone_t *end_io;		/* IO completion function */

	void *private;			/* copy from map_bh.b_private */

	/* BIO completion state */
	spinlock_t bio_lock;		/* protects BIO fields below */
	int page_errors;		/* errno from get_user_pages() */
	int is_async;			/* is IO async? */
	bool defer_completion;		/* defer AIO completion to workqueue? */
	bool should_dirty;		/* if pages should be dirtied */
	int io_error;			/* IO error in completion path */
	unsigned long refcount;		/* direct_io_worker() and bios */
	struct bio *bio_list;		/* singly linked via bi_private */
	struct task_struct *waiter;	/* waiting task (NULL if none) */

	/* AIO related stuff */
	struct kiocb *iocb;		/* kiocb */
	ssize_t result;			/* IO result */

	/*
	 * pages[] (and any fields placed after it) are not zeroed out at
	 * allocation time.  Don't add new fields after pages[] unless you
	 * wish that they not be zeroed.
	 */
	union {
		struct page *pages[DIO_PAGES];	/* page buffer */
		struct work_struct complete_work;/* deferred AIO completion */
	};
} ____cacheline_aligned_in_smp;

static struct kmem_cache *dio_cache __read_mostly;

/*
 * How many pages are in the queue?
 */
static inline unsigned dio_pages_present(struct dio_submit *sdio)
{
	return sdio->tail - sdio->head;
}

/*
 * Go grab and pin some userspace pages.  Typically we'll get 64 at a time.
 */
static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
{
	ssize_t ret;

	ret = iov_iter_get_pages(sdio->iter, dio->pages, LONG_MAX, DIO_PAGES,
				&sdio->from);

	if (ret < 0 && sdio->blocks_available && (dio->op == REQ_OP_WRITE)) {
		struct page *page = ZERO_PAGE(0);
		/*
		 * A memory fault, but the filesystem has some outstanding
		 * mapped blocks.  We need to use those blocks up to avoid
		 * leaking stale data in the file.
		 */
		if (dio->page_errors == 0)
			dio->page_errors = ret;
		get_page(page);
		dio->pages[0] = page;
		sdio->head = 0;
		sdio->tail = 1;
		sdio->from = 0;
		sdio->to = PAGE_SIZE;
		return 0;
	}

	if (ret >= 0) {
		iov_iter_advance(sdio->iter, ret);
		ret += sdio->from;
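		/*
		 * e.g. with 4096-byte pages, from = 512 and 1024 bytes mapped:
		 * ret becomes 1536, so tail works out to 1 page and to = 1536,
		 * the end offset within the last page.
		 */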
		sdio->head = 0;
		sdio->tail = (ret + PAGE_SIZE - 1) / PAGE_SIZE;
		sdio->to = ((ret - 1) & (PAGE_SIZE - 1)) + 1;
		return 0;
	}
	return ret;
}

/*
 * Get another userspace page.  Returns an ERR_PTR on error.  Pages are
 * buffered inside the dio so that we can call get_user_pages() against a
 * decent number of pages, less frequently.  This also gives nicer use of
 * the L1 cache.
 */
static inline struct page *dio_get_page(struct dio *dio,
					struct dio_submit *sdio)
{
	if (dio_pages_present(sdio) == 0) {
		int ret;

		ret = dio_refill_pages(dio, sdio);
		if (ret)
			return ERR_PTR(ret);
		BUG_ON(dio_pages_present(sdio) == 0);
	}
	return dio->pages[sdio->head];
}

/*
 * dio_complete() - called when all DIO BIO I/O has been completed
 *
 * This drops i_dio_count, lets interested parties know that a DIO operation
 * has completed, and calculates the resulting return code for the operation.
 *
 * It lets the filesystem know if it registered an interest earlier via
 * get_block.  Pass the private field of the map buffer_head so that
 * filesystems can use it to hold additional state between get_block calls and
 * dio_complete.
 */
static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
{
	loff_t offset = dio->iocb->ki_pos;
	ssize_t transferred = 0;
	int err;

	/*
	 * AIO submission can race with bio completion to get here while
	 * expecting to have the last io completed by bio completion.
	 * In that case -EIOCBQUEUED is in fact not an error we want
	 * to preserve through this call.
	 */
	if (ret == -EIOCBQUEUED)
		ret = 0;

	if (dio->result) {
		transferred = dio->result;

		/* Check for short read case */
		if ((dio->op == REQ_OP_READ) &&
		    ((offset + transferred) > dio->i_size))
			transferred = dio->i_size - offset;
		/* ignore EFAULT if some IO has been done */
		if (unlikely(ret == -EFAULT) && transferred)
			ret = 0;
	}

	if (ret == 0)
		ret = dio->page_errors;
	if (ret == 0)
		ret = dio->io_error;
	if (ret == 0)
		ret = transferred;

	if (dio->end_io) {
		// XXX: ki_pos??
		err = dio->end_io(dio->iocb, offset, ret, dio->private);
		if (err)
			ret = err;
	}

	/*
	 * Try again to invalidate clean pages which might have been cached by
	 * non-direct readahead, or faulted in by get_user_pages() if the source
	 * of the write was an mmap'ed region of the file we're writing.  Either
	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
	 * this invalidation fails, tough, the write still worked...
	 *
	 * And this page cache invalidation has to be after dio->end_io(), as
	 * some filesystems convert unwritten extents to real allocations in
	 * end_io() when necessary, otherwise a racing buffer read would cache
	 * zeros from unwritten extents.
	 */
	if (flags & DIO_COMPLETE_INVALIDATE &&
	    ret > 0 && dio->op == REQ_OP_WRITE &&
	    dio->inode->i_mapping->nrpages) {
		err = invalidate_inode_pages2_range(dio->inode->i_mapping,
					offset >> PAGE_SHIFT,
					(offset + ret - 1) >> PAGE_SHIFT);
		if (err)
			dio_warn_stale_pagecache(dio->iocb->ki_filp);
	}

	inode_dio_end(dio->inode);

	if (flags & DIO_COMPLETE_ASYNC) {
		/*
		 * generic_write_sync expects ki_pos to have been updated
		 * already, but the submission path only does this for
		 * synchronous I/O.
		 */
		dio->iocb->ki_pos += transferred;

		if (ret > 0 && dio->op == REQ_OP_WRITE)
			ret = generic_write_sync(dio->iocb, ret);
		dio->iocb->ki_complete(dio->iocb, ret, 0);
	}

	kmem_cache_free(dio_cache, dio);
	return ret;
}

static void dio_aio_complete_work(struct work_struct *work)
{
	struct dio *dio = container_of(work, struct dio, complete_work);

	dio_complete(dio, 0, DIO_COMPLETE_ASYNC | DIO_COMPLETE_INVALIDATE);
}

static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio);

/*
 * Asynchronous IO callback.
 */
static void dio_bio_end_aio(struct bio *bio)
{
	struct dio *dio = bio->bi_private;
	unsigned long remaining;
	unsigned long flags;
	bool defer_completion = false;

	/* cleanup the bio */
	dio_bio_complete(dio, bio);

	spin_lock_irqsave(&dio->bio_lock, flags);
	remaining = --dio->refcount;
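	/*
	 * Each in-flight bio holds a dio reference (taken in dio_bio_submit())
	 * and the submitting task holds one of its own, so remaining == 1
	 * means all bios have now completed and only the submitter, possibly
	 * sleeping in dio_await_one(), is left to be woken.
	 */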
	if (remaining == 1 && dio->waiter)
		wake_up_process(dio->waiter);
	spin_unlock_irqrestore(&dio->bio_lock, flags);

	if (remaining == 0) {
		/*
		 * Defer completion when defer_completion is set or
		 * when the inode has pages mapped and this is an AIO write.
		 * We need to invalidate those pages because there is a
		 * chance they contain stale data in the case buffered IO
		 * went in between AIO submission and completion into the
		 * same region.
		 */
		if (dio->result)
			defer_completion = dio->defer_completion ||
					(dio->op == REQ_OP_WRITE &&
					 dio->inode->i_mapping->nrpages);
		if (defer_completion) {
			INIT_WORK(&dio->complete_work, dio_aio_complete_work);
			queue_work(dio->inode->i_sb->s_dio_done_wq,
				   &dio->complete_work);
		} else {
			dio_complete(dio, 0, DIO_COMPLETE_ASYNC);
		}
	}
}

/*
 * The BIO completion handler simply queues the BIO up for the process-context
 * handler.
 *
 * During I/O bi_private points at the dio.  After I/O, bi_private is used to
 * implement a singly-linked list of completed BIOs, at dio->bio_list.
 */
static void dio_bio_end_io(struct bio *bio)
{
	struct dio *dio = bio->bi_private;
	unsigned long flags;

	spin_lock_irqsave(&dio->bio_lock, flags);
	bio->bi_private = dio->bio_list;
	dio->bio_list = bio;
	if (--dio->refcount == 1 && dio->waiter)
		wake_up_process(dio->waiter);
	spin_unlock_irqrestore(&dio->bio_lock, flags);
}

static inline void
dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
	      struct block_device *bdev,
	      sector_t first_sector, int nr_vecs)
{
	struct bio *bio;
	struct inode *inode = dio->inode;

	/*
	 * bio_alloc() is guaranteed to return a bio when allowed to sleep and
	 * we request a valid number of vectors.
	 */
	bio = bio_alloc(GFP_KERNEL, nr_vecs);

	fscrypt_set_bio_crypt_ctx(bio, inode,
				  sdio->cur_page_fs_offset >> inode->i_blkbits,
				  GFP_KERNEL);
	bio_set_dev(bio, bdev);
	bio->bi_iter.bi_sector = first_sector;
	bio_set_op_attrs(bio, dio->op, dio->op_flags);
	if (dio->is_async)
		bio->bi_end_io = dio_bio_end_aio;
	else
		bio->bi_end_io = dio_bio_end_io;

	bio->bi_write_hint = dio->iocb->ki_hint;

	sdio->bio = bio;
	sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
}

/*
 * In the AIO read case we speculatively dirty the pages before starting IO.
 * During IO completion, any of these pages which happen to have been written
 * back will be redirtied by bio_check_pages_dirty().
 *
 * bios hold a dio reference between submit_bio and ->end_io.
 */
static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
{
	struct bio *bio = sdio->bio;
	unsigned long flags;

	bio->bi_private = dio;

	spin_lock_irqsave(&dio->bio_lock, flags);
	dio->refcount++;
	spin_unlock_irqrestore(&dio->bio_lock, flags);

	if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty)
		bio_set_pages_dirty(bio);

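	/* Remember the target disk so dio_await_one() can poll its queue. */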
	dio->bio_disk = bio->bi_disk;

	if (sdio->submit_io) {
		sdio->submit_io(bio, dio->inode, sdio->logical_offset_in_bio);
		dio->bio_cookie = BLK_QC_T_NONE;
	} else
		dio->bio_cookie = submit_bio(bio);

	sdio->bio = NULL;
	sdio->boundary = 0;
	sdio->logical_offset_in_bio = 0;
}

/*
 * Release any resources in case of a failure
 */
static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio)
{
	while (sdio->head < sdio->tail)
		put_page(dio->pages[sdio->head++]);
}

/*
 * Wait for the next BIO to complete.  Remove it and return it.  NULL is
 * returned once all BIOs have been completed.  This must only be called once
 * all bios have been issued so that dio->refcount can only decrease.  This
 * requires that the caller hold a reference on the dio.
 */
static struct bio *dio_await_one(struct dio *dio)
{
	unsigned long flags;
	struct bio *bio = NULL;

	spin_lock_irqsave(&dio->bio_lock, flags);

	/*
	 * Wait as long as the list is empty and there are bios in flight.  bio
	 * completion drops the count, maybe adds to the list, and wakes while
	 * holding the bio_lock so we don't need set_current_state()'s barrier
	 * and can call it after testing our condition.
	 */
	while (dio->refcount > 1 && dio->bio_list == NULL) {
		__set_current_state(TASK_UNINTERRUPTIBLE);
		dio->waiter = current;
		spin_unlock_irqrestore(&dio->bio_lock, flags);
		if (!(dio->iocb->ki_flags & IOCB_HIPRI) ||
		    !blk_poll(dio->bio_disk->queue, dio->bio_cookie, true))
			blk_io_schedule();
		/* wake up sets us TASK_RUNNING */
		spin_lock_irqsave(&dio->bio_lock, flags);
		dio->waiter = NULL;
	}
	if (dio->bio_list) {
		bio = dio->bio_list;
		dio->bio_list = bio->bi_private;
	}
	spin_unlock_irqrestore(&dio->bio_lock, flags);
	return bio;
}

/*
 * Process one completed BIO.  No locks are held.
 */
static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio)
{
	blk_status_t err = bio->bi_status;
	bool should_dirty = dio->op == REQ_OP_READ && dio->should_dirty;

	if (err) {
		if (err == BLK_STS_AGAIN && (bio->bi_opf & REQ_NOWAIT))
			dio->io_error = -EAGAIN;
		else
			dio->io_error = -EIO;
	}

	if (dio->is_async && should_dirty) {
		bio_check_pages_dirty(bio);	/* transfers ownership */
	} else {
		bio_release_pages(bio, should_dirty);
		bio_put(bio);
	}
	return err;
}

/*
 * Wait on and process all in-flight BIOs.  This must only be called once
 * all bios have been issued so that the refcount can only decrease.
 * This just waits for all bios to make it through dio_bio_complete.  IO
 * errors are propagated through dio->io_error and should be propagated via
 * dio_complete().
 */
static void dio_await_completion(struct dio *dio)
{
	struct bio *bio;
	do {
		bio = dio_await_one(dio);
		if (bio)
			dio_bio_complete(dio, bio);
	} while (bio);
}

/*
 * A really large O_DIRECT read or write can generate a lot of BIOs.  So
 * to keep the memory consumption sane we periodically reap any completed BIOs
 * during the BIO generation phase.
 *
 * This also helps to limit the peak amount of pinned userspace memory.
 */
static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio)
{
	int ret = 0;

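	/* reap_counter rate-limits this: only scan the list every 64 calls. */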
	if (sdio->reap_counter++ >= 64) {
		while (dio->bio_list) {
			unsigned long flags;
			struct bio *bio;
			int ret2;

			spin_lock_irqsave(&dio->bio_lock, flags);
			bio = dio->bio_list;
			dio->bio_list = bio->bi_private;
			spin_unlock_irqrestore(&dio->bio_lock, flags);
			ret2 = blk_status_to_errno(dio_bio_complete(dio, bio));
			if (ret == 0)
				ret = ret2;
		}
		sdio->reap_counter = 0;
	}
	return ret;
}

/*
 * Create workqueue for deferred direct IO completions.  We allocate the
 * workqueue when it's first needed.  This avoids creating the workqueue for
 * filesystems that don't need it and also allows us to create the workqueue
 * late enough so that we can include s_id in the name of the workqueue.
 */
int sb_init_dio_done_wq(struct super_block *sb)
{
	struct workqueue_struct *old;
	struct workqueue_struct *wq = alloc_workqueue("dio/%s",
						      WQ_MEM_RECLAIM, 0,
						      sb->s_id);
	if (!wq)
		return -ENOMEM;
	/*
	 * This has to be atomic as more DIOs can race to create the workqueue
	 */
	old = cmpxchg(&sb->s_dio_done_wq, NULL, wq);
	/* Someone created workqueue before us?  Free ours... */
	if (old)
		destroy_workqueue(wq);
	return 0;
}

static int dio_set_defer_completion(struct dio *dio)
{
	struct super_block *sb = dio->inode->i_sb;

	if (dio->defer_completion)
		return 0;
	dio->defer_completion = true;
	if (!sb->s_dio_done_wq)
		return sb_init_dio_done_wq(sb);
	return 0;
}

/*
 * Call into the fs to map some more disk blocks.  We record the current number
 * of available blocks at sdio->blocks_available.  These are in units of the
 * fs blocksize, i_blocksize(inode).
 *
 * The fs is allowed to map lots of blocks at once.  If it wants to do that,
 * it uses the passed inode-relative block number as the file offset, as usual.
 *
 * get_block() is passed the number of i_blkbits-sized blocks which direct_io
 * has remaining to do.  The fs should not map more than this number of blocks.
 *
 * If the fs has mapped a lot of blocks, it should populate bh->b_size to
 * indicate how much contiguous disk space has been made available at
 * bh->b_blocknr.
 *
 * If *any* of the mapped blocks are new, then the fs must set buffer_new().
 * This isn't very efficient...
 *
 * In the case of filesystem holes: the fs may return an arbitrarily-large
 * hole by returning an appropriate value in b_size and by clearing
 * buffer_mapped().  However the direct-io code will only process holes one
 * block at a time - it will repeatedly call get_block() as it walks the hole.
 */
static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
			   struct buffer_head *map_bh)
{
	int ret;
	sector_t fs_startblk;	/* Into file, in filesystem-sized blocks */
	sector_t fs_endblk;	/* Into file, in filesystem-sized blocks */
	unsigned long fs_count;	/* Number of filesystem-sized blocks */
	int create;
	unsigned int i_blkbits = sdio->blkbits + sdio->blkfactor;
	loff_t i_size;

	/*
	 * If there was a memory error and we've overwritten all the
	 * mapped blocks then we can now return that memory error
	 */
	ret = dio->page_errors;
	if (ret == 0) {
		BUG_ON(sdio->block_in_file >= sdio->final_block_in_request);
		fs_startblk = sdio->block_in_file >> sdio->blkfactor;
		fs_endblk = (sdio->final_block_in_request - 1) >>
					sdio->blkfactor;
		fs_count = fs_endblk - fs_startblk + 1;
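		/*
		 * e.g. with blkfactor = 3 (512-byte dio_blocks on a 4096-byte
		 * blocksize fs), block_in_file = 20 and final_block_in_request
		 * = 64 give fs_startblk = 2, fs_endblk = 7 and fs_count = 6.
		 */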

		map_bh->b_state = 0;
		map_bh->b_size = fs_count << i_blkbits;

		/*
		 * For writes that could fill holes inside i_size on a
		 * DIO_SKIP_HOLES filesystem we forbid block creations: only
		 * overwrites are permitted.  We will return early to the caller
		 * once we see an unmapped buffer head returned, and the caller
		 * will fall back to buffered I/O.
		 *
		 * Otherwise the decision is left to the get_blocks method,
		 * which may decide to handle it or also return an unmapped
		 * buffer head.
		 */
		create = dio->op == REQ_OP_WRITE;
		if (dio->flags & DIO_SKIP_HOLES) {
			i_size = i_size_read(dio->inode);
			if (i_size && fs_startblk <= (i_size - 1) >> i_blkbits)
				create = 0;
		}

		ret = (*sdio->get_block)(dio->inode, fs_startblk,
						map_bh, create);

		/* Store for completion */
		dio->private = map_bh->b_private;

		if (ret == 0 && buffer_defer_completion(map_bh))
			ret = dio_set_defer_completion(dio);
	}
	return ret;
}

/*
 * There is no bio.  Make one now.
 */
static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio,
		sector_t start_sector, struct buffer_head *map_bh)
{
	sector_t sector;
	int ret, nr_pages;

	ret = dio_bio_reap(dio, sdio);
	if (ret)
		goto out;
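	/* dio_blocks are (1 << blkbits) bytes; convert to 512-byte sectors. */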
	sector = start_sector << (sdio->blkbits - 9);
	nr_pages = min(sdio->pages_in_io, BIO_MAX_PAGES);
	BUG_ON(nr_pages <= 0);
	dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages);
	sdio->boundary = 0;
out:
	return ret;
}

/*
 * Attempt to put the current chunk of 'cur_page' into the current BIO.  If
 * that was successful then update final_block_in_bio and take a ref against
 * the just-added page.
 *
 * Return zero on success.  Non-zero means the caller needs to start a new BIO.
 */
static inline int dio_bio_add_page(struct dio_submit *sdio)
{
	int ret;

	ret = bio_add_page(sdio->bio, sdio->cur_page,
			sdio->cur_page_len, sdio->cur_page_offset);
	if (ret == sdio->cur_page_len) {
		/*
		 * Decrement count only if we are done with this page
		 */
		if ((sdio->cur_page_len + sdio->cur_page_offset) == PAGE_SIZE)
			sdio->pages_in_io--;
		get_page(sdio->cur_page);
		sdio->final_block_in_bio = sdio->cur_page_block +
			(sdio->cur_page_len >> sdio->blkbits);
		ret = 0;
	} else {
		ret = 1;
	}
	return ret;
}

/*
 * Put cur_page under IO.  The section of cur_page which is described by
 * cur_page_offset,cur_page_len is put into a BIO.  The section of cur_page
 * starts on-disk at cur_page_block.
 *
 * We take a ref against the page here (on behalf of its presence in the bio).
 *
 * The caller of this function is responsible for removing cur_page from the
 * dio, and for dropping the refcount which came from that presence.
 */
static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio,
		struct buffer_head *map_bh)
{
	int ret = 0;

	if (sdio->bio) {
		loff_t cur_offset = sdio->cur_page_fs_offset;
		loff_t bio_next_offset = sdio->logical_offset_in_bio +
			sdio->bio->bi_iter.bi_size;

		/*
		 * See whether this new request is contiguous with the old.
		 *
		 * Btrfs cannot handle having logically non-contiguous requests
		 * submitted.  For example if you have
		 *
		 * Logical:  [0-4095][HOLE][8192-12287]
		 * Physical: [0-4095]      [4096-8191]
		 *
		 * We cannot submit those pages together as one BIO.  So if our
		 * current logical offset in the file does not equal what would
		 * be the next logical offset in the bio, submit the bio we
		 * have.
		 *
		 * When fscrypt inline encryption is used, data unit number
		 * (DUN) contiguity is also required.  Normally that's implied
		 * by logical contiguity.  However, certain IV generation
		 * methods (e.g. IV_INO_LBLK_32) don't guarantee it.  So, we
		 * must explicitly check fscrypt_mergeable_bio() too.
		 */
		if (sdio->final_block_in_bio != sdio->cur_page_block ||
		    cur_offset != bio_next_offset ||
		    !fscrypt_mergeable_bio(sdio->bio, dio->inode,
					   cur_offset >> dio->inode->i_blkbits))
			dio_bio_submit(dio, sdio);
	}

	if (sdio->bio == NULL) {
		ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh);
		if (ret)
			goto out;
	}

	if (dio_bio_add_page(sdio) != 0) {
		dio_bio_submit(dio, sdio);
		ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh);
		if (ret == 0) {
			ret = dio_bio_add_page(sdio);
			BUG_ON(ret != 0);
		}
	}
out:
	return ret;
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 802)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 803) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 804) * An autonomous function to put a chunk of a page under deferred IO.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 805) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 806) * The caller doesn't actually know (or care) whether this piece of page is in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 807) * a BIO, or is under IO or whatever. We just take care of all possible
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 808) * situations here. The separation between the logic of do_direct_IO() and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 809) * that of submit_page_section() is important for clarity. Please don't break it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 810) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 811) * The chunk of page starts on-disk at blocknr.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 812) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 813) * We perform deferred IO, by recording the last-submitted page inside our
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 814) * private part of the dio structure. If possible, we just expand the IO
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 815) * across that page here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 816) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 817) * If that doesn't work out then we put the old page into the bio and add this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 818) * page to the dio instead.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 819) */
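/*
 * For illustration of the coalescing described above (example values only,
 * assuming 512-byte dio blocks, i.e. blkbits == 9): three consecutive calls
 * for the same page,
 *
 *	submit_page_section(dio, sdio, page, 0,    512, N,     map_bh);
 *	submit_page_section(dio, sdio, page, 512,  512, N + 1, map_bh);
 *	submit_page_section(dio, sdio, page, 1024, 512, N + 2, map_bh);
 *
 * simply grow cur_page_len from 512 to 1536 without touching a bio. A call
 * for a different page or a non-adjacent block first flushes the deferred
 * page via dio_send_cur_page() and only then installs the new page as
 * cur_page.
 */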
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 820) static inline int
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 821) submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 822) unsigned offset, unsigned len, sector_t blocknr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 823) struct buffer_head *map_bh)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 824) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 825) int ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 826) int boundary = sdio->boundary; /* dio_send_cur_page may clear it */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 827)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 828) if (dio->op == REQ_OP_WRITE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 829) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 830) * Read accounting is performed in submit_bio()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 831) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 832) task_io_account_write(len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 833) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 834)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 835) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 836) * Can we just grow the current page's presence in the dio?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 837) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 838) if (sdio->cur_page == page &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 839) sdio->cur_page_offset + sdio->cur_page_len == offset &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 840) sdio->cur_page_block +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 841) (sdio->cur_page_len >> sdio->blkbits) == blocknr) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 842) sdio->cur_page_len += len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 843) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 844) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 845)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 846) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 847) * If there's a deferred page already there then send it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 848) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 849) if (sdio->cur_page) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 850) ret = dio_send_cur_page(dio, sdio, map_bh);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 851) put_page(sdio->cur_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 852) sdio->cur_page = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 853) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 854) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 855) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 856)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 857) get_page(page); /* It is in dio */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 858) sdio->cur_page = page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 859) sdio->cur_page_offset = offset;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 860) sdio->cur_page_len = len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 861) sdio->cur_page_block = blocknr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 862) sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 863) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 864) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 865) * If boundary then we want to schedule the IO now to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 866) * avoid metadata seeks.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 867) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 868) if (boundary) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 869) ret = dio_send_cur_page(dio, sdio, map_bh);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 870) if (sdio->bio)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 871) dio_bio_submit(dio, sdio);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 872) put_page(sdio->cur_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 873) sdio->cur_page = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 874) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 875) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 876) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 877)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 878) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 879) * If we are not writing the entire block and get_block() allocated
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 880) * the block for us, we need to fill-in the unused portion of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 881) * block with zeros. This happens only if the user buffer, file offset or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 882) * IO length is not a multiple of the filesystem block size.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 883) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 884) * `end' is zero if we're doing the start of the IO, 1 at the end of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 885) * IO.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 886) */
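/*
 * Worked example (illustrative numbers only): with 512-byte dio blocks inside
 * 4096-byte fs blocks, blkfactor is 3 and dio_blocks_per_fs_block is 8. A
 * write whose first dio block is block_in_file == 11 lands 11 & 7 == 3 dio
 * blocks into a newly allocated (buffer_new()) fs block, so the start-of-IO
 * call (end == 0) zeroes those 3 leading blocks, i.e. 1536 bytes. If the same
 * write ends with block_in_file == 21, the end-of-IO call (end == 1) zeroes
 * 8 - (21 & 7) == 3 trailing dio blocks of the last fs block.
 */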
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 887) static inline void dio_zero_block(struct dio *dio, struct dio_submit *sdio,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 888) int end, struct buffer_head *map_bh)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 889) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 890) unsigned dio_blocks_per_fs_block;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 891) unsigned this_chunk_blocks; /* In dio_blocks */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 892) unsigned this_chunk_bytes;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 893) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 894)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 895) sdio->start_zero_done = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 896) if (!sdio->blkfactor || !buffer_new(map_bh))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 897) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 898)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 899) dio_blocks_per_fs_block = 1 << sdio->blkfactor;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 900) this_chunk_blocks = sdio->block_in_file & (dio_blocks_per_fs_block - 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 901)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 902) if (!this_chunk_blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 903) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 904)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 905) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 906) * We need to zero out part of an fs block. It is either at the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 907) * beginning or the end of the fs block.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 908) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 909) if (end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 910) this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 911)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 912) this_chunk_bytes = this_chunk_blocks << sdio->blkbits;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 913)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 914) page = ZERO_PAGE(0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 915) if (submit_page_section(dio, sdio, page, 0, this_chunk_bytes,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 916) sdio->next_block_for_io, map_bh))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 917) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 918)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 919) sdio->next_block_for_io += this_chunk_blocks;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 920) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 921)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 922) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 923) * Walk the user pages, and the file, mapping blocks to disk and generating
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 924) * a sequence of (page,offset,len,block) mappings. These mappings are injected
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 925) * into submit_page_section(), which takes care of the next stage of submission.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 926) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 927) * Direct IO against a blockdev is different from direct IO against a file,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 928) * because we can happily perform page-sized but 512-byte aligned IOs. It is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 929) * important that blockdev IO be able to have fine alignment and large sizes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 930) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 931) * So what we do is to permit the ->get_block function to populate bh.b_size
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 932) * with the size of IO which is permitted at this offset and this i_blkbits.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 933) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 934) * For best results, the blockdev should be set up with 512-byte i_blkbits and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 935) * it should set b_size to PAGE_SIZE or more inside get_block(). This gives
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 936) * fine alignment but still allows this function to work in PAGE_SIZE units.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 937) */
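/*
 * For example (illustrative numbers only): with blkbits == 9, a get_block()
 * that maps 64KB at once by setting map_bh->b_size to 65536 makes
 * sdio->blocks_available 128, which covers sixteen 4096-byte pages worth of
 * data before get_more_blocks() has to be called again.
 */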
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 938) static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 939) struct buffer_head *map_bh)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 940) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 941) const unsigned blkbits = sdio->blkbits;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 942) const unsigned i_blkbits = blkbits + sdio->blkfactor;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 943) int ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 944)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 945) while (sdio->block_in_file < sdio->final_block_in_request) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 946) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 947) size_t from, to;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 948)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 949) page = dio_get_page(dio, sdio);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 950) if (IS_ERR(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 951) ret = PTR_ERR(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 952) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 953) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 954) from = sdio->head ? 0 : sdio->from;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 955) to = (sdio->head == sdio->tail - 1) ? sdio->to : PAGE_SIZE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 956) sdio->head++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 957)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 958) while (from < to) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 959) unsigned this_chunk_bytes; /* # of bytes mapped */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 960) unsigned this_chunk_blocks; /* # of blocks */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 961) unsigned u;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 962)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 963) if (sdio->blocks_available == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 964) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 965) * Need to go and map some more disk
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 966) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 967) unsigned long blkmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 968) unsigned long dio_remainder;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 969)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 970) ret = get_more_blocks(dio, sdio, map_bh);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 971) if (ret) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 972) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 973) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 974) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 975) if (!buffer_mapped(map_bh))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 976) goto do_holes;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 977)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 978) sdio->blocks_available =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 979) map_bh->b_size >> blkbits;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 980) sdio->next_block_for_io =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 981) map_bh->b_blocknr << sdio->blkfactor;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 982) if (buffer_new(map_bh)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 983) clean_bdev_aliases(
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 984) map_bh->b_bdev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 985) map_bh->b_blocknr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 986) map_bh->b_size >> i_blkbits);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 987) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 988)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 989) if (!sdio->blkfactor)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 990) goto do_holes;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 991)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 992) blkmask = (1 << sdio->blkfactor) - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 993) dio_remainder = (sdio->block_in_file & blkmask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 994)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 995) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 996) * If we are at the start of IO and that IO
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 997) * starts partway into a fs-block,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 998) * dio_remainder will be non-zero. If the IO
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 999) * is a read then we can simply advance the IO
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1000) * cursor to the first block which is to be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1001) * read. But if the IO is a write and the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1002) * block was newly allocated we cannot do that;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1003) * the start of the fs block must be zeroed out
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1004) * on-disk
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1005) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1006) if (!buffer_new(map_bh))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1007) sdio->next_block_for_io += dio_remainder;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1008) sdio->blocks_available -= dio_remainder;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1009) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1010) do_holes:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1011) /* Handle holes */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1012) if (!buffer_mapped(map_bh)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1013) loff_t i_size_aligned;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1014)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1015) /* AKPM: eargh, -ENOTBLK is a hack */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1016) if (dio->op == REQ_OP_WRITE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1017) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1018) return -ENOTBLK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1019) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1020)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1021) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1022) * Be sure to account for a partial block as the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1023) * last block in the file
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1024) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1025) i_size_aligned = ALIGN(i_size_read(dio->inode),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1026) 1 << blkbits);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1027) if (sdio->block_in_file >=
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1028) i_size_aligned >> blkbits) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1029) /* We hit eof */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1030) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1031) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1032) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1033) zero_user(page, from, 1 << blkbits);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1034) sdio->block_in_file++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1035) from += 1 << blkbits;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1036) dio->result += 1 << blkbits;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1037) goto next_block;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1038) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1039)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1040) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1041) * If we're performing IO with an alignment finer than
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1042) * that of the underlying fs blocks, check whether we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1043) * must zero out the start of this block.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1044) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1045) if (unlikely(sdio->blkfactor && !sdio->start_zero_done))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1046) dio_zero_block(dio, sdio, 0, map_bh);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1047)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1048) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1049) * Work out, in this_chunk_blocks, how much disk we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1050) * can add to this page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1051) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1052) this_chunk_blocks = sdio->blocks_available;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1053) u = (to - from) >> blkbits;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1054) if (this_chunk_blocks > u)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1055) this_chunk_blocks = u;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1056) u = sdio->final_block_in_request - sdio->block_in_file;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1057) if (this_chunk_blocks > u)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1058) this_chunk_blocks = u;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1059) this_chunk_bytes = this_chunk_blocks << blkbits;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1060) BUG_ON(this_chunk_bytes == 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1061)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1062) if (this_chunk_blocks == sdio->blocks_available)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1063) sdio->boundary = buffer_boundary(map_bh);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1064) ret = submit_page_section(dio, sdio, page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1065) from,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1066) this_chunk_bytes,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1067) sdio->next_block_for_io,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1068) map_bh);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1069) if (ret) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1070) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1071) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1072) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1073) sdio->next_block_for_io += this_chunk_blocks;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1074)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1075) sdio->block_in_file += this_chunk_blocks;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1076) from += this_chunk_bytes;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1077) dio->result += this_chunk_bytes;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1078) sdio->blocks_available -= this_chunk_blocks;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1079) next_block:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1080) BUG_ON(sdio->block_in_file > sdio->final_block_in_request);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1081) if (sdio->block_in_file == sdio->final_block_in_request)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1082) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1083) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1084)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1085) /* Drop the ref which was taken in get_user_pages() */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1086) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1087) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1088) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1089) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1090) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1091)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1092) static inline int drop_refcount(struct dio *dio)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1093) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1094) int ret2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1095) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1096)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1097) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1098) * Sync will always be dropping the final ref and completing the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1099) * operation. AIO can do so too, if it was a broken operation as described
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1100) * above, or if all the bios race to complete before we get here. In
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1101) * that case dio_complete() translates the EIOCBQUEUED into the proper
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1102) * return code that the caller will hand to ->complete().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1103) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1104) * This is managed by the bio_lock instead of being an atomic_t so that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1105) * completion paths can drop their ref and use the remaining count to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1106) * decide to wake the submission path atomically.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1107) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1108) spin_lock_irqsave(&dio->bio_lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1109) ret2 = --dio->refcount;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1110) spin_unlock_irqrestore(&dio->bio_lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1111) return ret2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1112) }
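/*
 * Reference model, for orientation (the bio-side ref handling lives earlier
 * in this file): the submitting task holds the initial reference set in
 * do_blockdev_direct_IO() below, and each in-flight bio holds one more that
 * is dropped when that bio completes.
 */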
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1113)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1114) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1115) * This is a library function for use by filesystem drivers.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1116) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1117) * The locking rules are governed by the flags parameter:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1118) * - if the flags value contains DIO_LOCKING we use a fancy locking
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1119) * scheme for dumb filesystems.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1120) * For writes this function is called under i_mutex and returns with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1121) * i_mutex held; for reads, i_mutex is not held on entry, but it is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1122) * taken and dropped again before returning.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1123) * - if the flags value does NOT contain DIO_LOCKING we don't use any
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1124) * internal locking but rather rely on the filesystem to synchronize
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1125) * direct I/O reads/writes versus each other and truncate.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1126) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1127) * To help with locking against truncate we increment the i_dio_count
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1128) * counter before starting direct I/O, and decrement it once we are done.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1129) * Truncate can wait for it to reach zero to provide exclusion. It is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1130) * expected that filesystems provide exclusion between new direct I/O
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1131) * and truncates. For DIO_LOCKING filesystems this is done by i_mutex,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1132) * but other filesystems need to take care of this on their own.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1133) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1134) * NOTE: if you pass "sdio" to anything by pointer make sure that function
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1135) * is always inlined. Otherwise gcc is unable to split the structure into
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1136) * individual fields and will generate much worse code. This is important
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1137) * for the whole file.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1138) */
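/*
 * As a concrete example of the flag conventions above (noting that the
 * wrapper itself lives outside this file): the common blockdev_direct_IO()
 * helper invokes this code with DIO_LOCKING | DIO_SKIP_HOLES, i.e. the
 * "fancy locking scheme" described above, while filesystems that do their
 * own synchronization pass flags without DIO_LOCKING.
 */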
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1139) static inline ssize_t
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1140) do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1141) struct block_device *bdev, struct iov_iter *iter,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1142) get_block_t get_block, dio_iodone_t end_io,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1143) dio_submit_t submit_io, int flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1144) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1145) unsigned i_blkbits = READ_ONCE(inode->i_blkbits);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1146) unsigned blkbits = i_blkbits;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1147) unsigned blocksize_mask = (1 << blkbits) - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1148) ssize_t retval = -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1149) const size_t count = iov_iter_count(iter);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1150) loff_t offset = iocb->ki_pos;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1151) const loff_t end = offset + count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1152) struct dio *dio;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1153) struct dio_submit sdio = { 0, };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1154) struct buffer_head map_bh = { 0, };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1155) struct blk_plug plug;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1156) unsigned long align = offset | iov_iter_alignment(iter);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1157)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1158) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1159) * Avoid references to bdev if not absolutely needed to give
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1160) * the early prefetch in the caller enough time.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1161) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1162)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1163) /* watch out for a 0 len io from a tricksy fs */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1164) if (iov_iter_rw(iter) == READ && !count)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1165) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1166)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1167) dio = kmem_cache_alloc(dio_cache, GFP_KERNEL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1168) if (!dio)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1169) return -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1170) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1171) * Believe it or not, zeroing out the page array caused a .5%
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1172) * performance regression in a database benchmark. So, we take
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1173) * care to only zero out what's needed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1174) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1175) memset(dio, 0, offsetof(struct dio, pages));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1176)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1177) dio->flags = flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1178) if (dio->flags & DIO_LOCKING && iov_iter_rw(iter) == READ) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1179) /* will be released by direct_io_worker */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1180) inode_lock(inode);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1181) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1182)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1183) /* Once we sampled i_size check for reads beyond EOF */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1184) dio->i_size = i_size_read(inode);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1185) if (iov_iter_rw(iter) == READ && offset >= dio->i_size) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1186) retval = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1187) goto fail_dio;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1188) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1189)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1190) if (align & blocksize_mask) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1191) if (bdev)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1192) blkbits = blksize_bits(bdev_logical_block_size(bdev));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1193) blocksize_mask = (1 << blkbits) - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1194) if (align & blocksize_mask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1195) goto fail_dio;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1196) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1197)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1198) if (dio->flags & DIO_LOCKING && iov_iter_rw(iter) == READ) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1199) struct address_space *mapping = iocb->ki_filp->f_mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1200)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1201) retval = filemap_write_and_wait_range(mapping, offset, end - 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1202) if (retval)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1203) goto fail_dio;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1204) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1205)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1206) * For file-extending writes, updating i_size before data writeouts
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1207) * For file extending writes updating i_size before data writeouts
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1208) * complete can expose uninitialized blocks in dumb filesystems.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1209) * In that case we need to wait for I/O completion even if asked
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1210) * for an asynchronous write.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1211) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1212) if (is_sync_kiocb(iocb))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1213) dio->is_async = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1214) else if (iov_iter_rw(iter) == WRITE && end > i_size_read(inode))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1215) dio->is_async = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1216) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1217) dio->is_async = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1218)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1219) dio->inode = inode;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1220) if (iov_iter_rw(iter) == WRITE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1221) dio->op = REQ_OP_WRITE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1222) dio->op_flags = REQ_SYNC | REQ_IDLE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1223) if (iocb->ki_flags & IOCB_NOWAIT)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1224) dio->op_flags |= REQ_NOWAIT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1225) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1226) dio->op = REQ_OP_READ;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1227) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1228) if (iocb->ki_flags & IOCB_HIPRI)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1229) dio->op_flags |= REQ_HIPRI;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1230)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1231) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1232) * For AIO O_(D)SYNC writes we need to defer completions to a workqueue
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1233) * so that we can call ->fsync.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1234) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1235) if (dio->is_async && iov_iter_rw(iter) == WRITE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1236) retval = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1237) if (iocb->ki_flags & IOCB_DSYNC)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1238) retval = dio_set_defer_completion(dio);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1239) else if (!dio->inode->i_sb->s_dio_done_wq) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1240) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1241) * In case of AIO write racing with buffered read we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1242) * need to defer completion. We can't decide this now,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1243) * but the workqueue needs to be initialized here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1244) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1245) retval = sb_init_dio_done_wq(dio->inode->i_sb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1246) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1247) if (retval)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1248) goto fail_dio;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1249) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1250)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1251) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1252) * Will be decremented at I/O completion time.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1253) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1254) inode_dio_begin(inode);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1255)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1256) retval = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1257) sdio.blkbits = blkbits;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1258) sdio.blkfactor = i_blkbits - blkbits;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1259) sdio.block_in_file = offset >> blkbits;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1260)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1261) sdio.get_block = get_block;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1262) dio->end_io = end_io;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1263) sdio.submit_io = submit_io;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1264) sdio.final_block_in_bio = -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1265) sdio.next_block_for_io = -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1266)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1267) dio->iocb = iocb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1268)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1269) spin_lock_init(&dio->bio_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1270) dio->refcount = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1271)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1272) dio->should_dirty = iter_is_iovec(iter) && iov_iter_rw(iter) == READ;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1273) sdio.iter = iter;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1274) sdio.final_block_in_request = end >> blkbits;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1275)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1276) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1277) * In case of non-aligned buffers, we may need 2 more
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1278) * pages since we need to zero out the first and last block.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1279) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1280) if (unlikely(sdio.blkfactor))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1281) sdio.pages_in_io = 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1282)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1283) sdio.pages_in_io += iov_iter_npages(iter, INT_MAX);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1284)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1285) blk_start_plug(&plug);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1286)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1287) retval = do_direct_IO(dio, &sdio, &map_bh);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1288) if (retval)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1289) dio_cleanup(dio, &sdio);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1290)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1291) if (retval == -ENOTBLK) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1292) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1293) * The remaining part of the request will be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1294) * handled by buffered I/O when we return
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1295) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1296) retval = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1297) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1298) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1299) * There may be some unwritten disk at the end of a part-written
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1300) * fs-block-sized block. Go zero that now.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1301) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1302) dio_zero_block(dio, &sdio, 1, &map_bh);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1303)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1304) if (sdio.cur_page) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1305) ssize_t ret2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1306)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1307) ret2 = dio_send_cur_page(dio, &sdio, &map_bh);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1308) if (retval == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1309) retval = ret2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1310) put_page(sdio.cur_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1311) sdio.cur_page = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1312) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1313) if (sdio.bio)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1314) dio_bio_submit(dio, &sdio);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1315)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1316) blk_finish_plug(&plug);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1317)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1318) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1319) * It is possible that we return a short IO due to end of file.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1320) * In that case, we need to release all the pages we got hold of.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1321) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1322) dio_cleanup(dio, &sdio);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1323)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1324) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1325) * All block lookups have been performed. For READ requests
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1326) * we can let i_mutex go now that it has achieved its purpose
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1327) * of protecting us from looking up uninitialized blocks.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1328) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1329) if (iov_iter_rw(iter) == READ && (dio->flags & DIO_LOCKING))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1330) inode_unlock(dio->inode);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1331)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1332) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1333) * The only time we want to leave bios in flight is when a successful
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1334) * partial aio read or full aio write has been set up. In that case
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1335) * bio completion will call aio_complete. The only time it's safe to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1336) * call aio_complete is when we return -EIOCBQUEUED, so we key on that.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1337) * This had *better* be the only place that raises -EIOCBQUEUED.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1338) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1339) BUG_ON(retval == -EIOCBQUEUED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1340) if (dio->is_async && retval == 0 && dio->result &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1341) (iov_iter_rw(iter) == READ || dio->result == count))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1342) retval = -EIOCBQUEUED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1343) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1344) dio_await_completion(dio);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1345)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1346) if (drop_refcount(dio) == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1347) retval = dio_complete(dio, retval, DIO_COMPLETE_INVALIDATE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1348) } else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1349) BUG_ON(retval != -EIOCBQUEUED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1350)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1351) return retval;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1352)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1353) fail_dio:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1354) if (dio->flags & DIO_LOCKING && iov_iter_rw(iter) == READ)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1355) inode_unlock(inode);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1356)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1357) kmem_cache_free(dio_cache, dio);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1358) return retval;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1359) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1360)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1361) ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1362) struct block_device *bdev, struct iov_iter *iter,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1363) get_block_t get_block,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1364) dio_iodone_t end_io, dio_submit_t submit_io,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1365) int flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1366) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1367) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1368) * The block device state is needed in the end to finally
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1369) * submit everything. Since it's likely to be cache cold,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1370) * prefetch it here as the first thing to hide some of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1371) * latency.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1372) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1373) * Attempt to prefetch the pieces we likely need later.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1374) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1375) prefetch(&bdev->bd_disk->part_tbl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1376) prefetch(bdev->bd_disk->queue);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1377) prefetch((char *)bdev->bd_disk->queue + SMP_CACHE_BYTES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1378)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1379) return do_blockdev_direct_IO(iocb, inode, bdev, iter, get_block,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1380) end_io, submit_io, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1381) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1382)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1383) EXPORT_SYMBOL_NS(__blockdev_direct_IO, ANDROID_GKI_VFS_EXPORT_ONLY);
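/*
 * Minimal caller sketch (hypothetical filesystem "foo"; not part of this
 * file): a simple get_block() based filesystem usually reaches
 * __blockdev_direct_IO() through the blockdev_direct_IO() helper from its
 * ->direct_IO address_space operation:
 *
 *	static ssize_t foo_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 *	{
 *		struct inode *inode = file_inode(iocb->ki_filp);
 *
 *		return blockdev_direct_IO(iocb, inode, iter, foo_get_block);
 *	}
 *
 * foo_get_block() is the filesystem's own get_block_t implementation; the
 * helper supplies inode->i_sb->s_bdev and the DIO_LOCKING-style flags.
 */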
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1384)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1385) static __init int dio_init(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1386) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1387) dio_cache = KMEM_CACHE(dio, SLAB_PANIC);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1388) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1389) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1390) module_init(dio_init)