// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/readahead.c - address_space-level file readahead.
 *
 * Copyright (C) 2002, Linus Torvalds
 *
 * 09Apr2002    Andrew Morton
 *              Initial version.
 */

#include <linux/kernel.h>
#include <linux/dax.h>
#include <linux/gfp.h>
#include <linux/export.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/pagevec.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/file.h>
#include <linux/mm_inline.h>
#include <linux/blk-cgroup.h>
#include <linux/fadvise.h>
#include <linux/sched/mm.h>
#include <trace/hooks/mm.h>

#include "internal.h"

/*
 * Initialise a struct file's readahead state.  Assumes that the caller has
 * memset *ra to zero.
 */
void
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
{
        ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages;
        ra->prev_pos = -1;
}
EXPORT_SYMBOL_GPL(file_ra_state_init);

/*
 * see if a page needs releasing upon read_cache_pages() failure
 * - the caller of read_cache_pages() may have set PG_private or PG_fscache
 *   before calling, such as the NFS fs marking pages that are cached locally
 *   on disk, thus we need to give the fs a chance to clean up in the event of
 *   an error
 */
static void read_cache_pages_invalidate_page(struct address_space *mapping,
                                             struct page *page)
{
        if (page_has_private(page)) {
                if (!trylock_page(page))
                        BUG();
                page->mapping = mapping;
                do_invalidatepage(page, 0, PAGE_SIZE);
                page->mapping = NULL;
                unlock_page(page);
        }
        put_page(page);
}

/*
 * release a list of pages, invalidating them first if need be
 */
static void read_cache_pages_invalidate_pages(struct address_space *mapping,
                                              struct list_head *pages)
{
        struct page *victim;

        while (!list_empty(pages)) {
                victim = lru_to_page(pages);
                list_del(&victim->lru);
                read_cache_pages_invalidate_page(mapping, victim);
        }
}

/**
 * read_cache_pages - populate an address space with some pages & start reads against them
 * @mapping: the address_space
 * @pages: The address of a list_head which contains the target pages.  These
 *   pages have their ->index populated and are otherwise uninitialised.
 * @filler: callback routine for filling a single page.
 * @data: private data for the callback routine.
 *
 * Hides the details of the LRU cache etc from the filesystems.
 *
 * Returns: %0 on success, or the error returned by @filler otherwise.
 */
int read_cache_pages(struct address_space *mapping, struct list_head *pages,
                        int (*filler)(void *, struct page *), void *data)
{
        struct page *page;
        int ret = 0;

        while (!list_empty(pages)) {
                page = lru_to_page(pages);
                list_del(&page->lru);
                if (add_to_page_cache_lru(page, mapping, page->index,
                                readahead_gfp_mask(mapping))) {
                        read_cache_pages_invalidate_page(mapping, page);
                        continue;
                }
                put_page(page);

                ret = filler(data, page);
                if (unlikely(ret)) {
                        read_cache_pages_invalidate_pages(mapping, pages);
                        break;
                }
                task_io_account_read(PAGE_SIZE);
        }
        return ret;
}

EXPORT_SYMBOL(read_cache_pages);
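
/*
 * Illustrative caller sketch (hypothetical, not part of this file): a
 * read_cache_pages() filler usually just forwards each page to the
 * filesystem's own single-page read routine, with @data carrying whatever
 * context that routine needs (here, the struct file).  The names
 * example_filler() and example_readpage() are made up for illustration.
 *
 *      static int example_filler(void *data, struct page *page)
 *      {
 *              struct file *file = data;
 *
 *              return example_readpage(file, page);
 *      }
 *
 *      ... read_cache_pages(mapping, &pages, example_filler, file);
 */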

gfp_t readahead_gfp_mask(struct address_space *x)
{
        gfp_t mask = mapping_gfp_mask(x) | __GFP_NORETRY | __GFP_NOWARN;

        trace_android_rvh_set_readahead_gfp_mask(&mask);
        return mask;
}
EXPORT_SYMBOL_GPL(readahead_gfp_mask);

static void read_pages(struct readahead_control *rac, struct list_head *pages,
                bool skip_page)
{
        const struct address_space_operations *aops = rac->mapping->a_ops;
        struct page *page;
        struct blk_plug plug;

        if (!readahead_count(rac))
                goto out;

        blk_start_plug(&plug);

        if (aops->readahead) {
                aops->readahead(rac);
                /* Clean up the remaining pages */
                while ((page = readahead_page(rac))) {
                        unlock_page(page);
                        put_page(page);
                }
        } else if (aops->readpages) {
                aops->readpages(rac->file, rac->mapping, pages,
                                readahead_count(rac));
                /* Clean up the remaining pages */
                put_pages_list(pages);
                rac->_index += rac->_nr_pages;
                rac->_nr_pages = 0;
        } else {
                while ((page = readahead_page(rac))) {
                        aops->readpage(rac->file, page);
                        put_page(page);
                }
        }

        blk_finish_plug(&plug);

        BUG_ON(!list_empty(pages));
        BUG_ON(readahead_count(rac));

out:
        if (skip_page)
                rac->_index++;
}

/**
 * page_cache_ra_unbounded - Start unchecked readahead.
 * @ractl: Readahead control.
 * @nr_to_read: The number of pages to read.
 * @lookahead_size: Where to start the next readahead.
 *
 * This function is for filesystems to call when they want to start
 * readahead beyond a file's stated i_size.  This is almost certainly
 * not the function you want to call.  Use page_cache_async_readahead()
 * or page_cache_sync_readahead() instead.
 *
 * Context: File is referenced by caller.  Mutexes may be held by caller.
 * May sleep, but will not reenter filesystem to reclaim memory.
 */
void page_cache_ra_unbounded(struct readahead_control *ractl,
                unsigned long nr_to_read, unsigned long lookahead_size)
{
        struct address_space *mapping = ractl->mapping;
        unsigned long index = readahead_index(ractl);
        LIST_HEAD(page_pool);
        gfp_t gfp_mask = readahead_gfp_mask(mapping);
        unsigned long i;

        /*
         * Partway through the readahead operation, we will have added
         * locked pages to the page cache, but will not yet have submitted
         * them for I/O.  Adding another page may need to allocate memory,
         * which can trigger memory reclaim.  Telling the VM we're in
         * the middle of a filesystem operation will cause it to not
         * touch file-backed pages, preventing a deadlock.  Most (all?)
         * filesystems already specify __GFP_NOFS in their mapping's
         * gfp_mask, but let's be explicit here.
         */
        unsigned int nofs = memalloc_nofs_save();

        /*
         * Preallocate as many pages as we will need.
         */
        for (i = 0; i < nr_to_read; i++) {
                struct page *page = xa_load(&mapping->i_pages, index + i);

                BUG_ON(index + i != ractl->_index + ractl->_nr_pages);

                if (page && !xa_is_value(page)) {
                        /*
                         * Page already present?  Kick off the current batch
                         * of contiguous pages before continuing with the
                         * next batch.  This page may be the one we would
                         * have intended to mark as Readahead, but we don't
                         * have a stable reference to this page, and it's
                         * not worth getting one just for that.
                         */
                        read_pages(ractl, &page_pool, true);
                        continue;
                }

                page = __page_cache_alloc(gfp_mask);
                if (!page)
                        break;
                if (mapping->a_ops->readpages) {
                        page->index = index + i;
                        list_add(&page->lru, &page_pool);
                } else if (add_to_page_cache_lru(page, mapping, index + i,
                                        gfp_mask) < 0) {
                        put_page(page);
                        read_pages(ractl, &page_pool, true);
                        continue;
                }
                if (i == nr_to_read - lookahead_size)
                        SetPageReadahead(page);
                ractl->_nr_pages++;
        }

        /*
         * Now start the IO.  We ignore I/O errors - if the page is not
         * uptodate then the caller will launch readpage again, and
         * will then handle the error.
         */
        read_pages(ractl, &page_pool, false);
        memalloc_nofs_restore(nofs);
}
EXPORT_SYMBOL_GPL(page_cache_ra_unbounded);

/*
 * do_page_cache_ra() actually reads a chunk of disk.  It allocates
 * the pages first, then submits them for I/O.  This avoids the very bad
 * behaviour which would occur if page allocations are causing VM writeback.
 * We really don't want to intermingle reads and writes like that.
 */
void do_page_cache_ra(struct readahead_control *ractl,
                unsigned long nr_to_read, unsigned long lookahead_size)
{
        struct inode *inode = ractl->mapping->host;
        unsigned long index = readahead_index(ractl);
        loff_t isize = i_size_read(inode);
        pgoff_t end_index;      /* The last page we want to read */

        if (isize == 0)
                return;

        end_index = (isize - 1) >> PAGE_SHIFT;
        if (index > end_index)
                return;
        /* Don't read past the page containing the last byte of the file */
        if (nr_to_read > end_index - index)
                nr_to_read = end_index - index + 1;

        page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size);
}

/*
 * Chunk the readahead into 2 megabyte units, so that we don't pin too much
 * memory at once.
 */
void force_page_cache_ra(struct readahead_control *ractl,
                struct file_ra_state *ra, unsigned long nr_to_read)
{
        struct address_space *mapping = ractl->mapping;
        struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
        unsigned long max_pages, index;

        if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages &&
                        !mapping->a_ops->readahead))
                return;

        /*
         * If the request exceeds the readahead window, allow the read to
         * be up to the optimal hardware IO size
         */
        index = readahead_index(ractl);
        max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
        nr_to_read = min_t(unsigned long, nr_to_read, max_pages);
        while (nr_to_read) {
                unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE;

                if (this_chunk > nr_to_read)
                        this_chunk = nr_to_read;
                ractl->_index = index;
                do_page_cache_ra(ractl, this_chunk, 0);

                index += this_chunk;
                nr_to_read -= this_chunk;
        }
}
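
/*
 * Illustrative arithmetic (assuming 4KiB pages): each iteration of the loop
 * above issues at most 2MiB / 4KiB = 512 pages, so a request for, say, 10MiB
 * (2560 pages) is submitted as five 512-page chunks, subject to the max_pages
 * clamp applied before the loop.
 */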

/*
 * Set the initial window size: round the request up to the next power of
 * two, then scale it up aggressively for small requests (x4), moderately
 * for medium ones (x2), and clamp it to the maximum readahead window for
 * large ones.
 */
static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
        unsigned long newsize = roundup_pow_of_two(size);

        if (newsize <= max / 32)
                newsize = newsize * 4;
        else if (newsize <= max / 4)
                newsize = newsize * 2;
        else
                newsize = max;

        return newsize;
}
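
/*
 * Illustrative walk-through (assuming 4KiB pages and a 32-page, i.e. 128KiB,
 * maximum window): a 1-page first read rounds to 1, which is <= max/32, and
 * quadruples to 4 pages (16KiB); a 3-page read rounds to 4, which is <= max/4,
 * and doubles to 8 pages (32KiB); a 20-page read rounds to 32, which exceeds
 * max/4, and is clamped to the full 32-page window.
 */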

/*
 * Get the previous window size, ramp it up, and
 * return it as the new window size.
 */
static unsigned long get_next_ra_size(struct file_ra_state *ra,
                                      unsigned long max)
{
        unsigned long cur = ra->size;

        if (cur < max / 16)
                return 4 * cur;
        if (cur <= max / 2)
                return 2 * cur;
        return max;
}
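
/*
 * Illustrative ramp (assuming a 32-page maximum): a 4-page window grows to
 * 8, then 16, then 32 pages, and stays clamped there.  With a larger maximum
 * (say 256 pages), a window smaller than max/16 quadruples instead of
 * doubling, e.g. 8 pages jumps straight to 32.
 */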

/*
 * On-demand readahead design.
 *
 * The fields in struct file_ra_state represent the most-recently-executed
 * readahead attempt:
 *
 *                        |<----- async_size ---------|
 *     |------------------- size -------------------->|
 *     |==================#===========================|
 *     ^start             ^page marked with PG_readahead
 *
 * To overlap application thinking time and disk I/O time, we do
 * `readahead pipelining': Do not wait until the application consumed all
 * readahead pages and stalled on the missing page at readahead_index;
 * Instead, submit an asynchronous readahead I/O as soon as there are
 * only async_size pages left in the readahead window.  Normally async_size
 * will be equal to size, for maximum pipelining.
 *
 * In interleaved sequential reads, concurrent streams on the same fd can
 * be invalidating each other's readahead state.  So we flag the new readahead
 * page at (start+size-async_size) with PG_readahead, and use it as readahead
 * indicator.  The flag won't be set on already cached pages, to avoid the
 * readahead-for-nothing fuss, saving pointless page cache lookups.
 *
 * prev_pos tracks the last visited byte in the _previous_ read request.
 * It should be maintained by the caller, and will be used for detecting
 * small random reads.  Note that the readahead algorithm checks loosely
 * for sequential patterns.  Hence interleaved reads might be served as
 * sequential ones.
 *
 * There is a special case: if the first page which the application tries to
 * read happens to be the first page of the file, it is assumed that a linear
 * read is about to happen and the window is immediately set to the initial
 * size based on the I/O request size and the max_readahead.
 *
 * The code ramps up the readahead size aggressively at first, but slows down
 * as it approaches max_readahead.
 */
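
/*
 * Illustrative walk-through (assuming 4KiB pages, a 32-page maximum window
 * and a cold page cache): a 1-page read at index 0 takes the initial
 * readahead path below and sets start=0, size=4, async_size=3, so pages 0-3
 * are read and page 1 is marked PG_readahead.  When the application later
 * reaches page 1, the async path sees the expected offset
 * (start + size - async_size), advances start to 4, ramps size to 8 with
 * async_size == size, reads pages 4-11 and marks page 4.  Subsequent marker
 * hits grow the window to 16 and then 32 pages, where it stays clamped.
 */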

/*
 * Count contiguously cached pages from @index-1 to @index-@max,
 * this count is a conservative estimation of
 *      - length of the sequential read sequence, or
 *      - thrashing threshold in memory tight systems
 */
static pgoff_t count_history_pages(struct address_space *mapping,
                                   pgoff_t index, unsigned long max)
{
        pgoff_t head;

        rcu_read_lock();
        head = page_cache_prev_miss(mapping, index - 1, max);
        rcu_read_unlock();

        return index - 1 - head;
}

/*
 * page cache context based read-ahead
 */
static int try_context_readahead(struct address_space *mapping,
                                 struct file_ra_state *ra,
                                 pgoff_t index,
                                 unsigned long req_size,
                                 unsigned long max)
{
        pgoff_t size;

        size = count_history_pages(mapping, index, max);

        /*
         * not enough history pages:
         * it could be a random read
         */
        if (size <= req_size)
                return 0;

        /*
         * starts from beginning of file:
         * it is a strong indication of long-run stream (or whole-file-read)
         */
        if (size >= index)
                size *= 2;

        ra->start = index;
        ra->size = min(size + req_size, max);
        ra->async_size = 1;

        return 1;
}

/*
 * A minimal readahead algorithm for trivial sequential/random reads.
 */
static void ondemand_readahead(struct readahead_control *ractl,
                struct file_ra_state *ra, bool hit_readahead_marker,
                unsigned long req_size)
{
        struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host);
        unsigned long max_pages = ra->ra_pages;
        unsigned long add_pages;
        unsigned long index = readahead_index(ractl);
        pgoff_t prev_index;

        /*
         * If the request exceeds the readahead window, allow the read to
         * be up to the optimal hardware IO size
         */
        if (req_size > max_pages && bdi->io_pages > max_pages)
                max_pages = min(req_size, bdi->io_pages);

        trace_android_vh_ra_tuning_max_page(ractl, &max_pages);

        /*
         * start of file
         */
        if (!index)
                goto initial_readahead;

        /*
         * It's the expected callback index, assume sequential access.
         * Ramp up sizes, and push forward the readahead window.
         */
        if ((index == (ra->start + ra->size - ra->async_size) ||
             index == (ra->start + ra->size))) {
                ra->start += ra->size;
                ra->size = get_next_ra_size(ra, max_pages);
                ra->async_size = ra->size;
                goto readit;
        }

        /*
         * Hit a marked page without valid readahead state.
         * E.g. interleaved reads.
         * Query the pagecache for async_size, which normally equals the
         * readahead size.  Ramp it up and use it as the new readahead size.
         */
        if (hit_readahead_marker) {
                pgoff_t start;

                rcu_read_lock();
                start = page_cache_next_miss(ractl->mapping, index + 1,
                                max_pages);
                rcu_read_unlock();

                if (!start || start - index > max_pages)
                        return;

                ra->start = start;
                ra->size = start - index;       /* old async_size */
                ra->size += req_size;
                ra->size = get_next_ra_size(ra, max_pages);
                ra->async_size = ra->size;
                goto readit;
        }

        /*
         * oversize read
         */
        if (req_size > max_pages)
                goto initial_readahead;

        /*
         * sequential cache miss
         * trivial case: (index - prev_index) == 1
         * unaligned reads: (index - prev_index) == 0
         */
        prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT;
        if (index - prev_index <= 1UL)
                goto initial_readahead;

        /*
         * Query the page cache and look for the traces (cached history pages)
         * that a sequential stream would leave behind.
         */
        if (try_context_readahead(ractl->mapping, ra, index, req_size,
                        max_pages))
                goto readit;

        /*
         * standalone, small random read
         * Read as is, and do not pollute the readahead state.
         */
        do_page_cache_ra(ractl, req_size, 0);
        return;

initial_readahead:
        ra->start = index;
        ra->size = get_init_ra_size(req_size, max_pages);
        ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;

readit:
        /*
         * Will this read hit the readahead marker made by itself?
         * If so, trigger the readahead marker hit now, and merge
         * the resulting next readahead window into the current one.
         * Take care of maximum IO pages as above.
         */
        if (index == ra->start && ra->size == ra->async_size) {
                add_pages = get_next_ra_size(ra, max_pages);
                if (ra->size + add_pages <= max_pages) {
                        ra->async_size = add_pages;
                        ra->size += add_pages;
                } else {
                        ra->size = max_pages;
                        ra->async_size = max_pages >> 1;
                }
        }

        ractl->_index = ra->start;
        do_page_cache_ra(ractl, ra->size, ra->async_size);
}

void page_cache_sync_ra(struct readahead_control *ractl,
                struct file_ra_state *ra, unsigned long req_count)
{
        bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM);

        /*
         * Even if read-ahead is disabled, issue this request as read-ahead
         * as we'll need it to satisfy the requested range.  The forced
         * read-ahead will do the right thing and limit the read to just the
         * requested range, which we'll set to 1 page for this case.
         */
        if (!ra->ra_pages || blk_cgroup_congested()) {
                if (!ractl->file)
                        return;
                req_count = 1;
                do_forced_ra = true;
        }

        /* be dumb */
        if (do_forced_ra) {
                force_page_cache_ra(ractl, ra, req_count);
                return;
        }

        /* do read-ahead */
        ondemand_readahead(ractl, ra, false, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_sync_ra);

void page_cache_async_ra(struct readahead_control *ractl,
                struct file_ra_state *ra, struct page *page,
                unsigned long req_count)
{
        /* no read-ahead */
        if (!ra->ra_pages)
                return;

        /*
         * Same bit is used for PG_readahead and PG_reclaim.
         */
        if (PageWriteback(page))
                return;

        ClearPageReadahead(page);

        /*
         * Defer asynchronous read-ahead on IO congestion.
         */
        if (inode_read_congested(ractl->mapping->host))
                return;

        if (blk_cgroup_congested())
                return;

        /* do read-ahead */
        ondemand_readahead(ractl, ra, true, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_async_ra);

ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
{
        ssize_t ret;
        struct fd f;

        ret = -EBADF;
        f = fdget(fd);
        if (!f.file || !(f.file->f_mode & FMODE_READ))
                goto out;

        /*
         * The readahead() syscall is intended to run only on files
         * that can execute readahead.  If readahead is not possible
         * on this file, then we must return -EINVAL.
         */
        ret = -EINVAL;
        if (!f.file->f_mapping || !f.file->f_mapping->a_ops ||
            !S_ISREG(file_inode(f.file)->i_mode))
                goto out;

        ret = vfs_fadvise(f.file, offset, count, POSIX_FADV_WILLNEED);
out:
        fdput(f);
        return ret;
}

SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count)
{
        return ksys_readahead(fd, offset, count);
}
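
/*
 * Illustrative userspace usage (a sketch, not part of the kernel build): how
 * an application might drive the readahead(2) syscall defined above.  The
 * file path is hypothetical.
 *
 *      #define _GNU_SOURCE
 *      #include <fcntl.h>
 *      #include <unistd.h>
 *
 *      int main(void)
 *      {
 *              int fd = open("/var/data/big.file", O_RDONLY);
 *
 *              if (fd < 0)
 *                      return 1;
 *              // Populate the page cache for the first 2MiB of the file.
 *              readahead(fd, 0, 2 * 1024 * 1024);
 *              close(fd);
 *              return 0;
 *      }
 */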