/*
 *	An async IO implementation for Linux
 *	Written by Benjamin LaHaise <bcrl@kvack.org>
 *
 *	Implements an efficient asynchronous io interface.
 *
 *	Copyright 2000, 2001, 2002 Red Hat, Inc.  All Rights Reserved.
 *	Copyright 2018 Christoph Hellwig.
 *
 *	See ../COPYING for licensing terms.
 */
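
/*
 * The code below backs the io_setup()/io_submit()/io_getevents()/io_destroy()
 * system calls.  As a rough, illustrative sketch only (raw syscalls or libaio
 * wrappers assumed, error handling omitted, "fd" and "buf" standing in for an
 * open descriptor and a buffer), a userspace round trip looks like:
 *
 *	aio_context_t ctx = 0;
 *	struct iocb cb = { 0 }, *cbs[1] = { &cb };
 *	struct io_event ev;
 *
 *	io_setup(128, &ctx);		    // ioctx_alloc() + aio_setup_ring()
 *	cb.aio_fildes = fd;
 *	cb.aio_lio_opcode = IOCB_CMD_PREAD;
 *	cb.aio_buf = (__u64)(unsigned long)buf;
 *	cb.aio_nbytes = 4096;
 *	io_submit(ctx, 1, cbs);		    // queues an aio_kiocb
 *	io_getevents(ctx, 1, 1, &ev, NULL); // reaps from the aio ring
 *	io_destroy(ctx);		    // kill_ioctx() path
 */
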
#define pr_fmt(fmt) "%s: " fmt, __func__

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/time.h>
#include <linux/aio_abi.h>
#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/backing-dev.h>
#include <linux/refcount.h>
#include <linux/uio.h>

#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/timer.h>
#include <linux/aio.h>
#include <linux/highmem.h>
#include <linux/workqueue.h>
#include <linux/security.h>
#include <linux/eventfd.h>
#include <linux/blkdev.h>
#include <linux/compat.h>
#include <linux/migrate.h>
#include <linux/ramfs.h>
#include <linux/percpu-refcount.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>

#include <asm/kmap_types.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>

#include "internal.h"

#define KIOCB_KEY		0

#define AIO_RING_MAGIC			0xa10a10a1
#define AIO_RING_COMPAT_FEATURES	1
#define AIO_RING_INCOMPAT_FEATURES	0
struct aio_ring {
	unsigned	id;	/* kernel internal index number */
	unsigned	nr;	/* number of io_events */
	unsigned	head;	/* Written to by userland or under ring_lock
				 * mutex by aio_read_events_ring(). */
	unsigned	tail;

	unsigned	magic;
	unsigned	compat_features;
	unsigned	incompat_features;
	unsigned	header_length;	/* size of aio_ring */


	struct io_event		io_events[];
}; /* 128 bytes + ring size */
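
/*
 * Illustrative note (a sketch, not taken verbatim from this file): the header
 * above is immediately followed, in the same mapping, by ->nr struct io_event
 * slots.  aio_setup_ring() below maps the whole ring into userspace and the
 * context id handed back by io_setup() is that mapping's address (ctx->user_id
 * == ctx->mmap_base), so a consumer that sees incompat_features == 0 may reap
 * completions without a syscall, roughly:
 *
 *	struct aio_ring *ring = (struct aio_ring *)ctx_id;
 *	while (ring->head != ring->tail) {
 *		consume(&ring->io_events[ring->head]);	   // consume() is a placeholder
 *		ring->head = (ring->head + 1) % ring->nr;
 *	}
 *
 * Real consumers (e.g. libaio) add the memory barriers and checks this sketch
 * omits; it is only meant to explain why "head" is documented above as written
 * to by userland.
 */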

/*
 * Plugging is meant to work with larger batches of IOs. If we don't
 * have more than the below, then don't bother setting up a plug.
 */
#define AIO_PLUG_THRESHOLD	2

#define AIO_RING_PAGES	8

struct kioctx_table {
	struct rcu_head		rcu;
	unsigned		nr;
	struct kioctx __rcu	*table[];
};

struct kioctx_cpu {
	unsigned		reqs_available;
};

struct ctx_rq_wait {
	struct completion comp;
	atomic_t count;
};

struct kioctx {
	struct percpu_ref	users;
	atomic_t		dead;

	struct percpu_ref	reqs;

	unsigned long		user_id;

	struct __percpu kioctx_cpu *cpu;

	/*
	 * For percpu reqs_available, number of slots we move to/from global
	 * counter at a time:
	 */
	unsigned		req_batch;
	/*
	 * This is what userspace passed to io_setup(); it's not used for
	 * anything but counting against the global max_reqs quota.
	 *
	 * The real limit is nr_events - 1, which will be larger (see
	 * aio_setup_ring())
	 */
	unsigned		max_reqs;

	/* Size of ringbuffer, in units of struct io_event */
	unsigned		nr_events;

	unsigned long		mmap_base;
	unsigned long		mmap_size;

	struct page		**ring_pages;
	long			nr_pages;

	struct rcu_work		free_rwork;	/* see free_ioctx() */

	/*
	 * signals when all in-flight requests are done
	 */
	struct ctx_rq_wait	*rq_wait;

	struct {
		/*
		 * This counts the number of available slots in the ringbuffer,
		 * so we avoid overflowing it: it's decremented (if positive)
		 * when allocating a kiocb and incremented when the resulting
		 * io_event is pulled off the ringbuffer.
		 *
		 * We batch accesses to it with a percpu version.
		 */
		atomic_t	reqs_available;
	} ____cacheline_aligned_in_smp;

	struct {
		spinlock_t	ctx_lock;
		struct list_head active_reqs;	/* used for cancellation */
	} ____cacheline_aligned_in_smp;

	struct {
		struct mutex	ring_lock;
		wait_queue_head_t wait;
	} ____cacheline_aligned_in_smp;

	struct {
		unsigned	tail;
		unsigned	completed_events;
		spinlock_t	completion_lock;
	} ____cacheline_aligned_in_smp;

	struct page		*internal_pages[AIO_RING_PAGES];
	struct file		*aio_ring_file;

	unsigned		id;
};

/*
 * First field must be the file pointer in all the
 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
 */
struct fsync_iocb {
	struct file		*file;
	struct work_struct	work;
	bool			datasync;
	struct cred		*creds;
};

struct poll_iocb {
	struct file		*file;
	struct wait_queue_head	*head;
	__poll_t		events;
	bool			cancelled;
	bool			work_scheduled;
	bool			work_need_resched;
	struct wait_queue_entry	wait;
	struct work_struct	work;
};

/*
 * NOTE! Each of the iocb union members has the file pointer
 * as the first entry in their struct definition. So you can
 * access the file pointer through any of the sub-structs,
 * or directly as just 'ki_filp' in this struct.
 */
struct aio_kiocb {
	union {
		struct file		*ki_filp;
		struct kiocb		rw;
		struct fsync_iocb	fsync;
		struct poll_iocb	poll;
	};

	struct kioctx		*ki_ctx;
	kiocb_cancel_fn		*ki_cancel;

	struct io_event		ki_res;

	struct list_head	ki_list;	/* the aio core uses this
						 * for cancellation */
	refcount_t		ki_refcnt;

	/*
	 * If the aio_resfd field of the userspace iocb is not zero,
	 * this is the underlying eventfd context to deliver events to.
	 */
	struct eventfd_ctx	*ki_eventfd;
};

/*------ sysctl variables----*/
static DEFINE_SPINLOCK(aio_nr_lock);
unsigned long aio_nr;		/* current system wide number of aio requests */
unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
/*----end sysctl variables---*/
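
/*
 * For reference: these two are exposed to userspace as the fs.aio-nr (current
 * usage, read-only) and fs.aio-max-nr (tunable limit) sysctls; the sysctl
 * table entries themselves are registered elsewhere.  For example, an
 * administrator might raise the system-wide limit with:
 *
 *	sysctl -w fs.aio-max-nr=1048576
 */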

static struct kmem_cache	*kiocb_cachep;
static struct kmem_cache	*kioctx_cachep;

static struct vfsmount *aio_mnt;

static const struct file_operations aio_ring_fops;
static const struct address_space_operations aio_ctx_aops;

static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
{
	struct file *file;
	struct inode *inode = alloc_anon_inode(aio_mnt->mnt_sb);
	if (IS_ERR(inode))
		return ERR_CAST(inode);

	inode->i_mapping->a_ops = &aio_ctx_aops;
	inode->i_mapping->private_data = ctx;
	inode->i_size = PAGE_SIZE * nr_pages;

	file = alloc_file_pseudo(inode, aio_mnt, "[aio]",
				 O_RDWR, &aio_ring_fops);
	if (IS_ERR(file))
		iput(inode);
	return file;
}

static int aio_init_fs_context(struct fs_context *fc)
{
	if (!init_pseudo(fc, AIO_RING_MAGIC))
		return -ENOMEM;
	fc->s_iflags |= SB_I_NOEXEC;
	return 0;
}

/* aio_setup
 *	Creates the slab caches used by the aio routines; panics on
 *	failure, as this is done early during the boot sequence.
 */
static int __init aio_setup(void)
{
	static struct file_system_type aio_fs = {
		.name		= "aio",
		.init_fs_context = aio_init_fs_context,
		.kill_sb	= kill_anon_super,
	};
	aio_mnt = kern_mount(&aio_fs);
	if (IS_ERR(aio_mnt))
		panic("Failed to create aio fs mount.");

	kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
	kioctx_cachep = KMEM_CACHE(kioctx, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
	return 0;
}
__initcall(aio_setup);

static void put_aio_ring_file(struct kioctx *ctx)
{
	struct file *aio_ring_file = ctx->aio_ring_file;
	struct address_space *i_mapping;

	if (aio_ring_file) {
		truncate_setsize(file_inode(aio_ring_file), 0);

		/* Prevent further access to the kioctx from migratepages */
		i_mapping = aio_ring_file->f_mapping;
		spin_lock(&i_mapping->private_lock);
		i_mapping->private_data = NULL;
		ctx->aio_ring_file = NULL;
		spin_unlock(&i_mapping->private_lock);

		fput(aio_ring_file);
	}
}

static void aio_free_ring(struct kioctx *ctx)
{
	int i;

	/* Disconnect the kioctx from the ring file.  This prevents future
	 * accesses to the kioctx from page migration.
	 */
	put_aio_ring_file(ctx);

	for (i = 0; i < ctx->nr_pages; i++) {
		struct page *page;
		pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
			 page_count(ctx->ring_pages[i]));
		page = ctx->ring_pages[i];
		if (!page)
			continue;
		ctx->ring_pages[i] = NULL;
		put_page(page);
	}

	if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) {
		kfree(ctx->ring_pages);
		ctx->ring_pages = NULL;
	}
}

static int aio_ring_mremap(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct mm_struct *mm = vma->vm_mm;
	struct kioctx_table *table;
	int i, res = -EINVAL;

	spin_lock(&mm->ioctx_lock);
	rcu_read_lock();
	table = rcu_dereference(mm->ioctx_table);
	for (i = 0; i < table->nr; i++) {
		struct kioctx *ctx;

		ctx = rcu_dereference(table->table[i]);
		if (ctx && ctx->aio_ring_file == file) {
			if (!atomic_read(&ctx->dead)) {
				ctx->user_id = ctx->mmap_base = vma->vm_start;
				res = 0;
			}
			break;
		}
	}

	rcu_read_unlock();
	spin_unlock(&mm->ioctx_lock);
	return res;
}

static const struct vm_operations_struct aio_ring_vm_ops = {
	.mremap		= aio_ring_mremap,
#if IS_ENABLED(CONFIG_MMU)
	.fault		= filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= filemap_page_mkwrite,
#endif
};

static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_flags |= VM_DONTEXPAND;
	vma->vm_ops = &aio_ring_vm_ops;
	return 0;
}

static const struct file_operations aio_ring_fops = {
	.mmap = aio_ring_mmap,
};

#if IS_ENABLED(CONFIG_MIGRATION)
static int aio_migratepage(struct address_space *mapping, struct page *new,
			struct page *old, enum migrate_mode mode)
{
	struct kioctx *ctx;
	unsigned long flags;
	pgoff_t idx;
	int rc;

	/*
	 * We cannot support the _NO_COPY case here, because copy needs to
	 * happen under the ctx->completion_lock. That does not work with the
	 * migration workflow of MIGRATE_SYNC_NO_COPY.
	 */
	if (mode == MIGRATE_SYNC_NO_COPY)
		return -EINVAL;

	rc = 0;

	/* mapping->private_lock here protects against the kioctx teardown.  */
	spin_lock(&mapping->private_lock);
	ctx = mapping->private_data;
	if (!ctx) {
		rc = -EINVAL;
		goto out;
	}

	/* The ring_lock mutex.  This prevents aio_read_events() from writing
	 * to the ring's head, and prevents page migration from mucking in
	 * a partially initialized kioctx.
	 */
	if (!mutex_trylock(&ctx->ring_lock)) {
		rc = -EAGAIN;
		goto out;
	}

	idx = old->index;
	if (idx < (pgoff_t)ctx->nr_pages) {
		/* Make sure the old page hasn't already been changed */
		if (ctx->ring_pages[idx] != old)
			rc = -EAGAIN;
	} else
		rc = -EINVAL;

	if (rc != 0)
		goto out_unlock;

	/* Writeback must be complete */
	BUG_ON(PageWriteback(old));
	get_page(new);

	rc = migrate_page_move_mapping(mapping, new, old, 1);
	if (rc != MIGRATEPAGE_SUCCESS) {
		put_page(new);
		goto out_unlock;
	}

	/* Take completion_lock to prevent other writes to the ring buffer
	 * while the old page is copied to the new.  This prevents new
	 * events from being lost.
	 */
	spin_lock_irqsave(&ctx->completion_lock, flags);
	migrate_page_copy(new, old);
	BUG_ON(ctx->ring_pages[idx] != old);
	ctx->ring_pages[idx] = new;
	spin_unlock_irqrestore(&ctx->completion_lock, flags);

	/* The old page is no longer accessible. */
	put_page(old);

out_unlock:
	mutex_unlock(&ctx->ring_lock);
out:
	spin_unlock(&mapping->private_lock);
	return rc;
}
#endif

static const struct address_space_operations aio_ctx_aops = {
	.set_page_dirty	= __set_page_dirty_no_writeback,
#if IS_ENABLED(CONFIG_MIGRATION)
	.migratepage	= aio_migratepage,
#endif
};

static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
{
	struct aio_ring *ring;
	struct mm_struct *mm = current->mm;
	unsigned long size, unused;
	int nr_pages;
	int i;
	struct file *file;

	/* Compensate for the ring buffer's head/tail overlap entry */
	nr_events += 2;	/* 1 is required, 2 for good luck */

	size = sizeof(struct aio_ring);
	size += sizeof(struct io_event) * nr_events;

	nr_pages = PFN_UP(size);
	if (nr_pages < 0)
		return -EINVAL;

	file = aio_private_file(ctx, nr_pages);
	if (IS_ERR(file)) {
		ctx->aio_ring_file = NULL;
		return -ENOMEM;
	}

	ctx->aio_ring_file = file;
	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
			/ sizeof(struct io_event);

	ctx->ring_pages = ctx->internal_pages;
	if (nr_pages > AIO_RING_PAGES) {
		ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
					  GFP_KERNEL);
		if (!ctx->ring_pages) {
			put_aio_ring_file(ctx);
			return -ENOMEM;
		}
	}

	for (i = 0; i < nr_pages; i++) {
		struct page *page;
		page = find_or_create_page(file->f_mapping,
					   i, GFP_HIGHUSER | __GFP_ZERO);
		if (!page)
			break;
		pr_debug("pid(%d) page[%d]->count=%d\n",
			 current->pid, i, page_count(page));
		SetPageUptodate(page);
		unlock_page(page);

		ctx->ring_pages[i] = page;
	}
	ctx->nr_pages = i;

	if (unlikely(i != nr_pages)) {
		aio_free_ring(ctx);
		return -ENOMEM;
	}

	ctx->mmap_size = nr_pages * PAGE_SIZE;
	pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size);

	if (mmap_write_lock_killable(mm)) {
		ctx->mmap_size = 0;
		aio_free_ring(ctx);
		return -EINTR;
	}

	ctx->mmap_base = do_mmap(ctx->aio_ring_file, 0, ctx->mmap_size,
				 PROT_READ | PROT_WRITE,
				 MAP_SHARED, 0, &unused, NULL);
	mmap_write_unlock(mm);
	if (IS_ERR((void *)ctx->mmap_base)) {
		ctx->mmap_size = 0;
		aio_free_ring(ctx);
		return -ENOMEM;
	}

	pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);

	ctx->user_id = ctx->mmap_base;
	ctx->nr_events = nr_events; /* trusted copy */

	ring = kmap_atomic(ctx->ring_pages[0]);
	ring->nr = nr_events;	/* user copy */
	ring->id = ~0U;
	ring->head = ring->tail = 0;
	ring->magic = AIO_RING_MAGIC;
	ring->compat_features = AIO_RING_COMPAT_FEATURES;
	ring->incompat_features = AIO_RING_INCOMPAT_FEATURES;
	ring->header_length = sizeof(struct aio_ring);
	kunmap_atomic(ring);
	flush_dcache_page(ctx->ring_pages[0]);

	return 0;
}

#define AIO_EVENTS_PER_PAGE	(PAGE_SIZE / sizeof(struct io_event))
#define AIO_EVENTS_FIRST_PAGE	((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
#define AIO_EVENTS_OFFSET	(AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
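
/*
 * Worked example (a sketch, assuming 4 KiB pages and the 32-byte struct
 * io_event from <linux/aio_abi.h>): AIO_EVENTS_PER_PAGE is then 128, the
 * first page holds fewer events because struct aio_ring occupies its start,
 * and AIO_EVENTS_OFFSET re-aligns logical slot numbers with page boundaries.
 * Code later in this file locates an event slot "tail" roughly as:
 *
 *	pos = tail + AIO_EVENTS_OFFSET;
 *	page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE];
 *	event = (struct io_event *)kmap_atomic(page) + pos % AIO_EVENTS_PER_PAGE;
 */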

void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
{
	struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, rw);
	struct kioctx *ctx = req->ki_ctx;
	unsigned long flags;

	if (WARN_ON_ONCE(!list_empty(&req->ki_list)))
		return;

	spin_lock_irqsave(&ctx->ctx_lock, flags);
	list_add_tail(&req->ki_list, &ctx->active_reqs);
	req->ki_cancel = cancel;
	spin_unlock_irqrestore(&ctx->ctx_lock, flags);
}
EXPORT_SYMBOL(kiocb_set_cancel_fn);

/*
 * free_ioctx() should be RCU delayed to synchronize against the RCU
 * protected lookup_ioctx() and also needs process context to call
 * aio_free_ring().  Use rcu_work.
 */
static void free_ioctx(struct work_struct *work)
{
	struct kioctx *ctx = container_of(to_rcu_work(work), struct kioctx,
					  free_rwork);
	pr_debug("freeing %p\n", ctx);

	aio_free_ring(ctx);
	free_percpu(ctx->cpu);
	percpu_ref_exit(&ctx->reqs);
	percpu_ref_exit(&ctx->users);
	kmem_cache_free(kioctx_cachep, ctx);
}

static void free_ioctx_reqs(struct percpu_ref *ref)
{
	struct kioctx *ctx = container_of(ref, struct kioctx, reqs);

	/* At this point we know that there are no in-flight requests */
	if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count))
		complete(&ctx->rq_wait->comp);

	/* Synchronize against RCU protected table->table[] dereferences */
	INIT_RCU_WORK(&ctx->free_rwork, free_ioctx);
	queue_rcu_work(system_wq, &ctx->free_rwork);
}

/*
 * When this function runs, the kioctx has been removed from the "hash table"
 * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
 * now it's safe to cancel any that need to be.
 */
static void free_ioctx_users(struct percpu_ref *ref)
{
	struct kioctx *ctx = container_of(ref, struct kioctx, users);
	struct aio_kiocb *req;

	spin_lock_irq(&ctx->ctx_lock);

	while (!list_empty(&ctx->active_reqs)) {
		req = list_first_entry(&ctx->active_reqs,
				       struct aio_kiocb, ki_list);
		req->ki_cancel(&req->rw);
		list_del_init(&req->ki_list);
	}

	spin_unlock_irq(&ctx->ctx_lock);

	percpu_ref_kill(&ctx->reqs);
	percpu_ref_put(&ctx->reqs);
}
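
/*
 * Teardown ordering, for reference: kill_ioctx() (or exit_aio()) kills the
 * "users" ref; once it drops to zero, free_ioctx_users() above cancels any
 * still-active requests and kills the "reqs" ref; free_ioctx_reqs() then
 * signals ->rq_wait (if any) and queues free_ioctx() as RCU-delayed work,
 * which finally frees the ring and the kioctx itself.
 */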

static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
{
	unsigned i, new_nr;
	struct kioctx_table *table, *old;
	struct aio_ring *ring;

	spin_lock(&mm->ioctx_lock);
	table = rcu_dereference_raw(mm->ioctx_table);

	while (1) {
		if (table)
			for (i = 0; i < table->nr; i++)
				if (!rcu_access_pointer(table->table[i])) {
					ctx->id = i;
					rcu_assign_pointer(table->table[i], ctx);
					spin_unlock(&mm->ioctx_lock);

					/* While kioctx setup is in progress,
					 * we are protected from page migration
					 * changing ring_pages by ->ring_lock.
					 */
					ring = kmap_atomic(ctx->ring_pages[0]);
					ring->id = ctx->id;
					kunmap_atomic(ring);
					return 0;
				}

		new_nr = (table ? table->nr : 1) * 4;
		spin_unlock(&mm->ioctx_lock);

		table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) *
				new_nr, GFP_KERNEL);
		if (!table)
			return -ENOMEM;

		table->nr = new_nr;

		spin_lock(&mm->ioctx_lock);
		old = rcu_dereference_raw(mm->ioctx_table);

		if (!old) {
			rcu_assign_pointer(mm->ioctx_table, table);
		} else if (table->nr > old->nr) {
			memcpy(table->table, old->table,
			       old->nr * sizeof(struct kioctx *));

			rcu_assign_pointer(mm->ioctx_table, table);
			kfree_rcu(old, rcu);
		} else {
			kfree(table);
			table = old;
		}
	}
}

static void aio_nr_sub(unsigned nr)
{
	spin_lock(&aio_nr_lock);
	if (WARN_ON(aio_nr - nr > aio_nr))
		aio_nr = 0;
	else
		aio_nr -= nr;
	spin_unlock(&aio_nr_lock);
}

/* ioctx_alloc
 *	Allocates and initializes an ioctx.  Returns an ERR_PTR if it failed.
 */
static struct kioctx *ioctx_alloc(unsigned nr_events)
{
	struct mm_struct *mm = current->mm;
	struct kioctx *ctx;
	int err = -ENOMEM;

	/*
	 * Store the original nr_events -- what userspace passed to io_setup(),
	 * for counting against the global limit -- before it changes.
	 */
	unsigned int max_reqs = nr_events;

	/*
	 * We keep track of the number of available ringbuffer slots, to prevent
	 * overflow (reqs_available), and we also use percpu counters for this.
	 *
	 * So since up to half the slots might be on other CPUs' percpu counters
	 * and unavailable, double nr_events so userspace sees what they
	 * expected: additionally, we move req_batch slots to/from percpu
	 * counters at a time, so make sure that isn't 0:
	 */
	nr_events = max(nr_events, num_possible_cpus() * 4);
	nr_events *= 2;

	/* Prevent overflows */
	if (nr_events > (0x10000000U / sizeof(struct io_event))) {
		pr_debug("ENOMEM: nr_events too high\n");
		return ERR_PTR(-EINVAL);
	}

	if (!nr_events || (unsigned long)max_reqs > aio_max_nr)
		return ERR_PTR(-EAGAIN);

	ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
	if (!ctx)
		return ERR_PTR(-ENOMEM);

	ctx->max_reqs = max_reqs;

	spin_lock_init(&ctx->ctx_lock);
	spin_lock_init(&ctx->completion_lock);
	mutex_init(&ctx->ring_lock);
	/* Protect against page migration throughout kioctx setup by keeping
	 * the ring_lock mutex held until setup is complete. */
	mutex_lock(&ctx->ring_lock);
	init_waitqueue_head(&ctx->wait);

	INIT_LIST_HEAD(&ctx->active_reqs);

	if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL))
		goto err;

	if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, 0, GFP_KERNEL))
		goto err;

	ctx->cpu = alloc_percpu(struct kioctx_cpu);
	if (!ctx->cpu)
		goto err;

	err = aio_setup_ring(ctx, nr_events);
	if (err < 0)
		goto err;

	atomic_set(&ctx->reqs_available, ctx->nr_events - 1);
	ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4);
	if (ctx->req_batch < 1)
		ctx->req_batch = 1;
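
	/*
	 * Sizing example (a sketch; the exact ring size also depends on the
	 * page rounding in aio_setup_ring()): with 4 possible CPUs,
	 * io_setup(128) requests nr_events = max(128, 4 * 4) * 2 = 256 slots.
	 * Once the ring is sized, reqs_available starts at ctx->nr_events - 1
	 * and req_batch = (ctx->nr_events - 1) / (4 * 4), clamped to at least 1.
	 */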

	/* limit the number of system wide aios */
	spin_lock(&aio_nr_lock);
	if (aio_nr + ctx->max_reqs > aio_max_nr ||
	    aio_nr + ctx->max_reqs < aio_nr) {
		spin_unlock(&aio_nr_lock);
		err = -EAGAIN;
		goto err_ctx;
	}
	aio_nr += ctx->max_reqs;
	spin_unlock(&aio_nr_lock);

	percpu_ref_get(&ctx->users);	/* io_setup() will drop this ref */
	percpu_ref_get(&ctx->reqs);	/* free_ioctx_users() will drop this */

	err = ioctx_add_table(ctx, mm);
	if (err)
		goto err_cleanup;

	/* Release the ring_lock mutex now that all setup is complete. */
	mutex_unlock(&ctx->ring_lock);

	pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
		 ctx, ctx->user_id, mm, ctx->nr_events);
	return ctx;

err_cleanup:
	aio_nr_sub(ctx->max_reqs);
err_ctx:
	atomic_set(&ctx->dead, 1);
	if (ctx->mmap_size)
		vm_munmap(ctx->mmap_base, ctx->mmap_size);
	aio_free_ring(ctx);
err:
	mutex_unlock(&ctx->ring_lock);
	free_percpu(ctx->cpu);
	percpu_ref_exit(&ctx->reqs);
	percpu_ref_exit(&ctx->users);
	kmem_cache_free(kioctx_cachep, ctx);
	pr_debug("error allocating ioctx %d\n", err);
	return ERR_PTR(err);
}

/* kill_ioctx
 *	Cancels all outstanding aio requests on an aio context.  Used
 *	when the processes owning a context have all exited to encourage
 *	the rapid destruction of the kioctx.
 */
static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
		      struct ctx_rq_wait *wait)
{
	struct kioctx_table *table;

	spin_lock(&mm->ioctx_lock);
	if (atomic_xchg(&ctx->dead, 1)) {
		spin_unlock(&mm->ioctx_lock);
		return -EINVAL;
	}

	table = rcu_dereference_raw(mm->ioctx_table);
	WARN_ON(ctx != rcu_access_pointer(table->table[ctx->id]));
	RCU_INIT_POINTER(table->table[ctx->id], NULL);
	spin_unlock(&mm->ioctx_lock);

	/* free_ioctx_reqs() will do the necessary RCU synchronization */
	wake_up_all(&ctx->wait);

	/*
	 * It'd be more correct to do this in free_ioctx(), after all
	 * the outstanding kiocbs have finished - but by then io_destroy
	 * has already returned, so io_setup() could potentially return
	 * -EAGAIN with no ioctxs actually in use (as far as userspace
	 * could tell).
	 */
	aio_nr_sub(ctx->max_reqs);

	if (ctx->mmap_size)
		vm_munmap(ctx->mmap_base, ctx->mmap_size);

	ctx->rq_wait = wait;
	percpu_ref_kill(&ctx->users);
	return 0;
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 852)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 853) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 854) * exit_aio: called when the last user of mm goes away. At this point, there is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 855) * no way for any new requests to be submitted or any of the io_* syscalls to be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 856) * called on the context.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 857) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 858) * There may be outstanding kiocbs, but free_ioctx() will explicitly wait on
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 859) * them.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 860) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 861) void exit_aio(struct mm_struct *mm)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 862) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 863) struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 864) struct ctx_rq_wait wait;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 865) int i, skipped;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 866)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 867) if (!table)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 868) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 869)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 870) atomic_set(&wait.count, table->nr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 871) init_completion(&wait.comp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 872)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 873) skipped = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 874) for (i = 0; i < table->nr; ++i) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 875) struct kioctx *ctx =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 876) rcu_dereference_protected(table->table[i], true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 877)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 878) if (!ctx) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 879) skipped++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 880) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 881) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 882)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 883) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 884) * We don't need to bother with munmap() here - exit_mmap(mm)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 885) * is coming and it'll unmap everything. And we simply can't:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 886) * this is not necessarily our ->mm.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 887) * Since kill_ioctx() uses a non-zero ->mmap_size as an indicator
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 888) * that it needs to unmap the area, just set it to 0.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 889) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 890) ctx->mmap_size = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 891) kill_ioctx(mm, ctx, &wait);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 892) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 893)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 894) if (!atomic_sub_and_test(skipped, &wait.count)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 895) /* Wait until all IO for the context is done. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 896) wait_for_completion(&wait.comp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 897) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 898)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 899) RCU_INIT_POINTER(mm->ioctx_table, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 900) kfree(table);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 901) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 902)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 903) static void put_reqs_available(struct kioctx *ctx, unsigned nr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 904) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 905) struct kioctx_cpu *kcpu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 906) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 907)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 908) local_irq_save(flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 909) kcpu = this_cpu_ptr(ctx->cpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 910) kcpu->reqs_available += nr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 911)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 912) while (kcpu->reqs_available >= ctx->req_batch * 2) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 913) kcpu->reqs_available -= ctx->req_batch;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 914) atomic_add(ctx->req_batch, &ctx->reqs_available);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 915) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 916)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 917) local_irq_restore(flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 918) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 919)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 920) static bool __get_reqs_available(struct kioctx *ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 921) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 922) struct kioctx_cpu *kcpu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 923) bool ret = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 924) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 925)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 926) local_irq_save(flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 927) kcpu = this_cpu_ptr(ctx->cpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 928) if (!kcpu->reqs_available) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 929) int old, avail = atomic_read(&ctx->reqs_available);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 930)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 931) do {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 932) if (avail < ctx->req_batch)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 933) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 934)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 935) old = avail;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 936) avail = atomic_cmpxchg(&ctx->reqs_available,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 937) avail, avail - ctx->req_batch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 938) } while (avail != old);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 939)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 940) kcpu->reqs_available += ctx->req_batch;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 941) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 942)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 943) ret = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 944) kcpu->reqs_available--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 945) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 946) local_irq_restore(flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 947) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 948) }
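
/*
 * Illustrative sketch (not part of this file, not compiled): the batching
 * scheme used by put_reqs_available() and __get_reqs_available() above,
 * reduced to a userspace model with C11 atomics.  The shared pool is only
 * touched one batch at a time; each thread otherwise works out of a private
 * cache.  All names below (BATCH, get_slot, put_slot, ...) are invented for
 * the example.
 */
#if 0
#include <stdatomic.h>
#include <stdbool.h>

#define BATCH 16

static atomic_int shared_pool;		/* plays the role of ctx->reqs_available */
static _Thread_local int local_cache;	/* plays the role of kcpu->reqs_available */

static bool get_slot(void)
{
	if (!local_cache) {
		int avail = atomic_load(&shared_pool);

		/* Take a whole batch from the shared pool, or give up. */
		do {
			if (avail < BATCH)
				return false;
		} while (!atomic_compare_exchange_weak(&shared_pool, &avail,
						       avail - BATCH));
		local_cache = BATCH;
	}
	local_cache--;
	return true;
}

static void put_slot(void)
{
	/* Only hand slots back once we are sitting on two full batches. */
	if (++local_cache >= BATCH * 2) {
		local_cache -= BATCH;
		atomic_fetch_add(&shared_pool, BATCH);
	}
}
#endif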
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 949)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 950) /* refill_reqs_available
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 951) * Updates the reqs_available reference counts used for tracking the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 952) * number of free slots in the completion ring. This can be called
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 953) * from aio_complete() (to optimistically update reqs_available) or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 954) * from aio_get_req() (the "we're out of events" case). It must be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 955) * called holding ctx->completion_lock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 956) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 957) static void refill_reqs_available(struct kioctx *ctx, unsigned head,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 958) unsigned tail)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 959) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 960) unsigned events_in_ring, completed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 961)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 962) /* Clamp head since userland can write to it. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 963) head %= ctx->nr_events;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 964) if (head <= tail)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 965) events_in_ring = tail - head;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 966) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 967) events_in_ring = ctx->nr_events - (head - tail);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 968)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 969) completed = ctx->completed_events;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 970) if (events_in_ring < completed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 971) completed -= events_in_ring;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 972) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 973) completed = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 974)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 975) if (!completed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 976) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 977)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 978) ctx->completed_events -= completed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 979) put_reqs_available(ctx, completed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 980) }
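
/*
 * Worked example for the arithmetic above: with nr_events = 128, head = 120
 * and tail = 8, the ring wraps, so events_in_ring = 128 - (120 - 8) = 16
 * (slots 120..127 plus 0..7).  If ctx->completed_events is 20, then at
 * least 20 - 16 = 4 of those completed events must already have been
 * reaped, so 4 slots are handed back via put_reqs_available().
 */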
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 981)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 982) /* user_refill_reqs_available
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 983) * Called to refill reqs_available when aio_get_req() encounters an
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 984) * out-of-space condition in the completion ring.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 985) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 986) static void user_refill_reqs_available(struct kioctx *ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 987) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 988) spin_lock_irq(&ctx->completion_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 989) if (ctx->completed_events) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 990) struct aio_ring *ring;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 991) unsigned head;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 992)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 993) /* Accessing ring->head may race with aio_read_events_ring()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 994) * here, but that's okay: whether we read the old version or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 995) * the new version, either will be valid. The important
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 996) * part is that head cannot pass tail since we prevent
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 997) * aio_complete() from updating tail by holding
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 998) * ctx->completion_lock. Even if head is invalid, the check
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 999) * against ctx->completed_events below will make sure we do the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1000) * safe/right thing.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1001) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1002) ring = kmap_atomic(ctx->ring_pages[0]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1003) head = ring->head;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1004) kunmap_atomic(ring);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1005)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1006) refill_reqs_available(ctx, head, ctx->tail);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1007) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1008)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1009) spin_unlock_irq(&ctx->completion_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1010) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1011)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1012) static bool get_reqs_available(struct kioctx *ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1013) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1014) if (__get_reqs_available(ctx))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1015) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1016) user_refill_reqs_available(ctx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1017) return __get_reqs_available(ctx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1018) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1019)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1020) /* aio_get_req
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1021) * Allocate a slot for an aio request.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1022) * Returns NULL if no requests are free.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1023) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1024) * The refcount is initialized to 2 - one for the async op completion,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1025) * one for the synchronous submission code that allocates it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1026) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1027) static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1028) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1029) struct aio_kiocb *req;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1030)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1031) req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1032) if (unlikely(!req))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1033) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1034)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1035) if (unlikely(!get_reqs_available(ctx))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1036) kmem_cache_free(kiocb_cachep, req);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1037) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1038) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1039)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1040) percpu_ref_get(&ctx->reqs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1041) req->ki_ctx = ctx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1042) INIT_LIST_HEAD(&req->ki_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1043) refcount_set(&req->ki_refcnt, 2);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1044) req->ki_eventfd = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1045) return req;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1046) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1047)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1048) static struct kioctx *lookup_ioctx(unsigned long ctx_id)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1049) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1050) struct aio_ring __user *ring = (void __user *)ctx_id;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1051) struct mm_struct *mm = current->mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1052) struct kioctx *ctx, *ret = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1053) struct kioctx_table *table;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1054) unsigned id;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1055)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1056) if (get_user(id, &ring->id))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1057) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1058)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1059) rcu_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1060) table = rcu_dereference(mm->ioctx_table);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1061)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1062) if (!table || id >= table->nr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1063) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1064)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1065) id = array_index_nospec(id, table->nr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1066) ctx = rcu_dereference(table->table[id]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1067) if (ctx && ctx->user_id == ctx_id) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1068) if (percpu_ref_tryget_live(&ctx->users))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1069) ret = ctx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1070) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1071) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1072) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1073) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1074) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1075)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1076) static inline void iocb_destroy(struct aio_kiocb *iocb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1077) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1078) if (iocb->ki_eventfd)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1079) eventfd_ctx_put(iocb->ki_eventfd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1080) if (iocb->ki_filp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1081) fput(iocb->ki_filp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1082) percpu_ref_put(&iocb->ki_ctx->reqs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1083) kmem_cache_free(kiocb_cachep, iocb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1084) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1085)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1086) /* aio_complete
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1087) * Called when the io request on the given iocb is complete.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1088) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1089) static void aio_complete(struct aio_kiocb *iocb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1090) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1091) struct kioctx *ctx = iocb->ki_ctx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1092) struct aio_ring *ring;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1093) struct io_event *ev_page, *event;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1094) unsigned tail, pos, head;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1095) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1096)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1097) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1098) * Add a completion event to the ring buffer. Must be done holding
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1099) * ctx->completion_lock to prevent other code from messing with the tail
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1100) * pointer since we might be called from irq context.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1101) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1102) spin_lock_irqsave(&ctx->completion_lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1103)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1104) tail = ctx->tail;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1105) pos = tail + AIO_EVENTS_OFFSET;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1106)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1107) if (++tail >= ctx->nr_events)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1108) tail = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1109)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1110) ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1111) event = ev_page + pos % AIO_EVENTS_PER_PAGE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1112)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1113) *event = iocb->ki_res;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1114)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1115) kunmap_atomic(ev_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1116) flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1117)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1118) pr_debug("%p[%u]: %p: %p %Lx %Lx %Lx\n", ctx, tail, iocb,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1119) (void __user *)(unsigned long)iocb->ki_res.obj,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1120) iocb->ki_res.data, iocb->ki_res.res, iocb->ki_res.res2);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1121)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1122) /* after flagging the request as done, we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1123) * must never even look at it again
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1124) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1125) smp_wmb(); /* make event visible before updating tail */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1126)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1127) ctx->tail = tail;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1128)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1129) ring = kmap_atomic(ctx->ring_pages[0]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1130) head = ring->head;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1131) ring->tail = tail;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1132) kunmap_atomic(ring);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1133) flush_dcache_page(ctx->ring_pages[0]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1134)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1135) ctx->completed_events++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1136) if (ctx->completed_events > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1137) refill_reqs_available(ctx, head, tail);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1138) spin_unlock_irqrestore(&ctx->completion_lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1139)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1140) pr_debug("added to ring %p at [%u]\n", iocb, tail);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1141)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1142) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1143) * Check if the user asked us to deliver the result through an
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1144) * eventfd. The eventfd_signal() function is safe to be called
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1145) * from IRQ context.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1146) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1147) if (iocb->ki_eventfd)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1148) eventfd_signal(iocb->ki_eventfd, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1149)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1150) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1151) * We have to order our ring_info tail store above and test
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1152) * of the wait list below outside the wait lock. This is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1153) * like in wake_up_bit() where clearing a bit has to be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1154) * ordered with the unlocked test.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1155) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1156) smp_mb();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1157)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1158) if (waitqueue_active(&ctx->wait))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1159) wake_up(&ctx->wait);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1160) }
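
/*
 * Illustrative userspace fragment (not part of this file, not compiled):
 * how a submitter asks for the eventfd notification that aio_complete()
 * delivers above.  Only the uapi in <linux/aio_abi.h> and eventfd(2) are
 * relied on; the helper name is invented for the example.
 */
#if 0
#include <linux/aio_abi.h>
#include <sys/eventfd.h>

/* Returns the eventfd that will be signalled when @cb completes, or -1. */
static int request_eventfd_completion(struct iocb *cb)
{
	int efd = eventfd(0, EFD_CLOEXEC);

	if (efd >= 0) {
		cb->aio_flags |= IOCB_FLAG_RESFD;
		cb->aio_resfd = efd;
	}
	return efd;
}
#endif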
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1161)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1162) static inline void iocb_put(struct aio_kiocb *iocb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1163) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1164) if (refcount_dec_and_test(&iocb->ki_refcnt)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1165) aio_complete(iocb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1166) iocb_destroy(iocb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1167) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1168) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1169)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1170) /* aio_read_events_ring
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1171) * Pull events off the ioctx's event ring. Returns the number of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1172) * events fetched.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1173) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1174) static long aio_read_events_ring(struct kioctx *ctx,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1175) struct io_event __user *event, long nr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1176) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1177) struct aio_ring *ring;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1178) unsigned head, tail, pos;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1179) long ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1180) int copy_ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1181)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1182) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1183) * The mutex can block and wake us up and that will cause
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1184) * wait_event_interruptible_hrtimeout() to schedule without sleeping
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1185) * and repeat. This should be rare enough that it doesn't cause
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1186) * performance issues. See the comment in read_events() for more detail.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1187) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1188) sched_annotate_sleep();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1189) mutex_lock(&ctx->ring_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1190)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1191) /* Access to ->ring_pages here is protected by ctx->ring_lock. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1192) ring = kmap_atomic(ctx->ring_pages[0]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1193) head = ring->head;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1194) tail = ring->tail;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1195) kunmap_atomic(ring);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1196)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1197) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1198) * Ensure that once we've read the current tail pointer, that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1199) * we also see the events that were stored up to the tail.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1200) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1201) smp_rmb();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1202)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1203) pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1204)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1205) if (head == tail)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1206) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1207)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1208) head %= ctx->nr_events;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1209) tail %= ctx->nr_events;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1210)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1211) while (ret < nr) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1212) long avail;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1213) struct io_event *ev;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1214) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1215)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1216) avail = (head <= tail ? tail : ctx->nr_events) - head;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1217) if (head == tail)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1218) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1219)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1220) pos = head + AIO_EVENTS_OFFSET;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1221) page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1222) pos %= AIO_EVENTS_PER_PAGE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1223)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1224) avail = min(avail, nr - ret);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1225) avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - pos);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1226)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1227) ev = kmap(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1228) copy_ret = copy_to_user(event + ret, ev + pos,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1229) sizeof(*ev) * avail);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1230) kunmap(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1231)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1232) if (unlikely(copy_ret)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1233) ret = -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1234) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1235) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1236)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1237) ret += avail;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1238) head += avail;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1239) head %= ctx->nr_events;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1240) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1241)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1242) ring = kmap_atomic(ctx->ring_pages[0]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1243) ring->head = head;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1244) kunmap_atomic(ring);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1245) flush_dcache_page(ctx->ring_pages[0]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1246)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1247) pr_debug("%li h%u t%u\n", ret, head, tail);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1248) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1249) mutex_unlock(&ctx->ring_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1250)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1251) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1252) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1253)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1254) static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1255) struct io_event __user *event, long *i)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1256) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1257) long ret = aio_read_events_ring(ctx, event + *i, nr - *i);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1258)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1259) if (ret > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1260) *i += ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1261)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1262) if (unlikely(atomic_read(&ctx->dead)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1263) ret = -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1264)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1265) if (!*i)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1266) *i = ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1267)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1268) return ret < 0 || *i >= min_nr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1269) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1270)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1271) static long read_events(struct kioctx *ctx, long min_nr, long nr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1272) struct io_event __user *event,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1273) ktime_t until)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1274) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1275) long ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1276)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1277) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1278) * Note that aio_read_events() is being called as the conditional - i.e.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1279) * we're calling it after prepare_to_wait() has set task state to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1280) * TASK_INTERRUPTIBLE.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1281) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1282) * But aio_read_events() can block, and if it blocks it's going to flip
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1283) * the task state back to TASK_RUNNING.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1284) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1285) * This should be ok, provided it doesn't flip the state back to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1286) * TASK_RUNNING and return 0 too much - that causes us to spin. That
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1287) * will only happen if the mutex_lock() call blocks, and we then find
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1288) * the ringbuffer empty. So in practice we should be ok, but it's
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1289) * something to be aware of when touching this code.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1290) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1291) if (until == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1292) aio_read_events(ctx, min_nr, nr, event, &ret);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1293) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1294) wait_event_interruptible_hrtimeout(ctx->wait,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1295) aio_read_events(ctx, min_nr, nr, event, &ret),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1296) until);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1297) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1298) }
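
/*
 * Illustrative userspace fragment (not part of this file, not compiled):
 * the two branches above correspond to the io_getevents() timeout argument.
 * On the io_getevents() side (further down in this file) a zeroed timespec
 * ends up here as until == 0 and does a single non-blocking poll of the
 * ring, while a NULL timeout blocks until min_nr events (or a signal).
 * The helper name is invented for the example.
 */
#if 0
#include <linux/aio_abi.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <time.h>

/* Reap up to @nr already-completed events without blocking. */
static long poll_events(aio_context_t ctx, struct io_event *events, long nr)
{
	struct timespec ts = { 0, 0 };

	return syscall(__NR_io_getevents, ctx, 0, nr, events, &ts);
}
#endif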
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1299)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1300) /* sys_io_setup:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1301) * Create an aio_context capable of receiving at least nr_events.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1302) * ctxp must not point to an aio_context that already exists, and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1303) * must be initialized to 0 prior to the call. On successful
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1304) * creation of the aio_context, *ctxp is filled in with the resulting
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1305) * handle. May fail with -EINVAL if *ctxp is not initialized,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1306) * or if the specified nr_events exceeds internal limits. May fail
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1307) * with -EAGAIN if the specified nr_events exceeds the user's limit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1308) * of available events. May fail with -ENOMEM if insufficient kernel
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1309) * resources are available. May fail with -EFAULT if an invalid
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1310) * pointer is passed for ctxp. Will fail with -ENOSYS if not
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1311) * implemented.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1312) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1313) SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1314) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1315) struct kioctx *ioctx = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1316) unsigned long ctx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1317) long ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1318)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1319) ret = get_user(ctx, ctxp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1320) if (unlikely(ret))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1321) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1322)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1323) ret = -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1324) if (unlikely(ctx || nr_events == 0)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1325) pr_debug("EINVAL: ctx %lu nr_events %u\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1326) ctx, nr_events);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1327) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1328) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1329)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1330) ioctx = ioctx_alloc(nr_events);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1331) ret = PTR_ERR(ioctx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1332) if (!IS_ERR(ioctx)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1333) ret = put_user(ioctx->user_id, ctxp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1334) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1335) kill_ioctx(current->mm, ioctx, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1336) percpu_ref_put(&ioctx->users);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1337) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1338)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1339) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1340) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1341) }
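
/*
 * Illustrative userspace sketch (not part of this file, not compiled):
 * minimal use of the syscall above, including the requirement that *ctxp
 * start out as zero.  Raw syscall(2) is used instead of a libaio wrapper.
 */
#if 0
#include <linux/aio_abi.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	aio_context_t ctx = 0;	/* must be zero, or io_setup() returns -EINVAL */

	if (syscall(__NR_io_setup, 128, &ctx) < 0) {
		perror("io_setup");
		return 1;
	}
	printf("aio context: %#llx\n", (unsigned long long)ctx);

	if (syscall(__NR_io_destroy, ctx) < 0) {
		perror("io_destroy");
		return 1;
	}
	return 0;
}
#endif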
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1342)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1343) #ifdef CONFIG_COMPAT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1344) COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_events, u32 __user *, ctx32p)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1345) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1346) struct kioctx *ioctx = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1347) unsigned long ctx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1348) long ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1349)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1350) ret = get_user(ctx, ctx32p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1351) if (unlikely(ret))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1352) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1353)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1354) ret = -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1355) if (unlikely(ctx || nr_events == 0)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1356) pr_debug("EINVAL: ctx %lu nr_events %u\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1357) ctx, nr_events);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1358) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1359) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1360)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1361) ioctx = ioctx_alloc(nr_events);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1362) ret = PTR_ERR(ioctx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1363) if (!IS_ERR(ioctx)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1364) /* truncating is ok because it's a user address */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1365) ret = put_user((u32)ioctx->user_id, ctx32p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1366) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1367) kill_ioctx(current->mm, ioctx, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1368) percpu_ref_put(&ioctx->users);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1369) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1370)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1371) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1372) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1373) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1374) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1375)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1376) /* sys_io_destroy:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1377) * Destroy the aio_context specified. May cancel any outstanding
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1378) * AIOs and block on completion. Will fail with -ENOSYS if not
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1379) * implemented. May fail with -EINVAL if the context pointed to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1380) * is invalid.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1381) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1382) SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1383) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1384) struct kioctx *ioctx = lookup_ioctx(ctx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1385) if (likely(NULL != ioctx)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1386) struct ctx_rq_wait wait;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1387) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1388)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1389) init_completion(&wait.comp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1390) atomic_set(&wait.count, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1391)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1392) /* Pass requests_done to kill_ioctx() where it can be set
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1393) * in a thread-safe way. If we try to set it here then we have
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1394) * a race condition if two io_destroy() calls run simultaneously.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1395) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1396) ret = kill_ioctx(current->mm, ioctx, &wait);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1397) percpu_ref_put(&ioctx->users);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1398)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1399) /* Wait until all IO for the context is done. Otherwise the kernel
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1400) * may keep using user-space buffers even though the user thinks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1401) * the context has been destroyed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1402) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1403) if (!ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1404) wait_for_completion(&wait.comp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1405)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1406) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1407) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1408) pr_debug("EINVAL: invalid context id\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1409) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1410) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1411)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1412) static void aio_remove_iocb(struct aio_kiocb *iocb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1413) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1414) struct kioctx *ctx = iocb->ki_ctx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1415) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1416)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1417) spin_lock_irqsave(&ctx->ctx_lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1418) list_del(&iocb->ki_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1419) spin_unlock_irqrestore(&ctx->ctx_lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1420) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1421)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1422) static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1423) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1424) struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, rw);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1425)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1426) if (!list_empty_careful(&iocb->ki_list))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1427) aio_remove_iocb(iocb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1428)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1429) if (kiocb->ki_flags & IOCB_WRITE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1430) struct inode *inode = file_inode(kiocb->ki_filp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1431)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1432) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1433) * Tell lockdep we inherited freeze protection from the submission
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1434) * thread.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1435) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1436) if (S_ISREG(inode->i_mode))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1437) __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1438) file_end_write(kiocb->ki_filp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1439) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1440)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1441) iocb->ki_res.res = res;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1442) iocb->ki_res.res2 = res2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1443) iocb_put(iocb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1444) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1445)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1446) static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1447) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1448) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1449)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1450) req->ki_complete = aio_complete_rw;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1451) req->private = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1452) req->ki_pos = iocb->aio_offset;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1453) req->ki_flags = iocb_flags(req->ki_filp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1454) if (iocb->aio_flags & IOCB_FLAG_RESFD)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1455) req->ki_flags |= IOCB_EVENTFD;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1456) req->ki_hint = ki_hint_validate(file_write_hint(req->ki_filp));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1457) if (iocb->aio_flags & IOCB_FLAG_IOPRIO) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1458) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1459) * If the IOCB_FLAG_IOPRIO flag of aio_flags is set, then
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1460) * aio_reqprio is interpreted as an I/O scheduling
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1461) * class and priority.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1462) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1463) ret = ioprio_check_cap(iocb->aio_reqprio);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1464) if (ret) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1465) pr_debug("aio ioprio check cap error: %d\n", ret);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1466) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1467) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1468)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1469) req->ki_ioprio = iocb->aio_reqprio;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1470) } else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1471) req->ki_ioprio = get_current_ioprio();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1472)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1473) ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1474) if (unlikely(ret))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1475) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1476)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1477) req->ki_flags &= ~IOCB_HIPRI; /* no one is going to poll for this I/O */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1478) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1479) }
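
/*
 * Illustrative userspace fragment (not part of this file, not compiled):
 * how aio_reqprio is expected to be filled in when IOCB_FLAG_IOPRIO is set,
 * using the same class/level encoding as ioprio_set(2) (class in the top
 * three bits above a 13-bit level field).  The helper name and the chosen
 * best-effort level are invented for the example.
 */
#if 0
#include <linux/aio_abi.h>

static void set_best_effort_prio(struct iocb *cb, int level)
{
	/* IOPRIO_CLASS_BE is 2; the class shift is 13. */
	cb->aio_flags |= IOCB_FLAG_IOPRIO;
	cb->aio_reqprio = (2 << 13) | level;
}
#endif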
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1480)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1481) static ssize_t aio_setup_rw(int rw, const struct iocb *iocb,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1482) struct iovec **iovec, bool vectored, bool compat,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1483) struct iov_iter *iter)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1484) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1485) void __user *buf = (void __user *)(uintptr_t)iocb->aio_buf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1486) size_t len = iocb->aio_nbytes;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1487)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1488) if (!vectored) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1489) ssize_t ret = import_single_range(rw, buf, len, *iovec, iter);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1490) *iovec = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1491) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1492) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1493)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1494) return __import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter, compat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1495) }
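
/*
 * Illustrative userspace fragment (not part of this file, not compiled):
 * for the vectored opcodes handled above (IOCB_CMD_PREADV/PWRITEV),
 * aio_buf points at the iovec array and aio_nbytes is the number of
 * iovecs, not a byte count.  Names other than the uapi ones are invented.
 */
#if 0
#include <linux/aio_abi.h>
#include <sys/uio.h>
#include <string.h>

static void fill_preadv_iocb(struct iocb *cb, int fd,
			     struct iovec *iov, unsigned int nr_iovs)
{
	memset(cb, 0, sizeof(*cb));
	cb->aio_lio_opcode = IOCB_CMD_PREADV;
	cb->aio_fildes = fd;
	cb->aio_buf = (unsigned long)iov;	/* iovec array, not data */
	cb->aio_nbytes = nr_iovs;		/* iovec count, not bytes */
	cb->aio_offset = 0;
}
#endif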
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1496)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1497) static inline void aio_rw_done(struct kiocb *req, ssize_t ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1498) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1499) switch (ret) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1500) case -EIOCBQUEUED:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1501) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1502) case -ERESTARTSYS:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1503) case -ERESTARTNOINTR:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1504) case -ERESTARTNOHAND:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1505) case -ERESTART_RESTARTBLOCK:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1506) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1507) * There's no easy way to restart the syscall since other AIOs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1508) * may already be running. Just fail this IO with EINTR.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1509) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1510) ret = -EINTR;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1511) fallthrough;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1512) default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1513) req->ki_complete(req, ret, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1514) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1515) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1516)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1517) static int aio_read(struct kiocb *req, const struct iocb *iocb,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1518) bool vectored, bool compat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1519) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1520) struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1521) struct iov_iter iter;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1522) struct file *file;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1523) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1524)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1525) ret = aio_prep_rw(req, iocb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1526) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1527) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1528) file = req->ki_filp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1529) if (unlikely(!(file->f_mode & FMODE_READ)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1530) return -EBADF;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1531) ret = -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1532) if (unlikely(!file->f_op->read_iter))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1533) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1534)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1535) ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1536) if (ret < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1537) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1538) ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1539) if (!ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1540) aio_rw_done(req, call_read_iter(file, req, &iter));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1541) kfree(iovec);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1542) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1543) }
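
/*
 * Illustrative userspace sketch (not part of this file, not compiled): a
 * complete IOCB_CMD_PREAD round trip through the submission path above and
 * the event ring.  The file name is arbitrary; error handling is minimal.
 */
#if 0
#include <linux/aio_abi.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
	aio_context_t ctx = 0;
	struct iocb cb;
	struct iocb *cbs[1] = { &cb };
	struct io_event ev;
	char buf[4096];
	int fd = open("/etc/hostname", O_RDONLY);

	if (fd < 0 || syscall(__NR_io_setup, 8, &ctx) < 0)
		return 1;

	memset(&cb, 0, sizeof(cb));
	cb.aio_lio_opcode = IOCB_CMD_PREAD;
	cb.aio_fildes = fd;
	cb.aio_buf = (unsigned long)buf;
	cb.aio_nbytes = sizeof(buf);
	cb.aio_offset = 0;

	if (syscall(__NR_io_submit, ctx, 1, cbs) != 1)
		return 1;

	/* Block for the single completion; ev.res is a byte count or -errno. */
	if (syscall(__NR_io_getevents, ctx, 1, 1, &ev, NULL) != 1)
		return 1;
	printf("read %lld bytes\n", (long long)ev.res);

	syscall(__NR_io_destroy, ctx);
	close(fd);
	return 0;
}
#endif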
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1544)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1545) static int aio_write(struct kiocb *req, const struct iocb *iocb,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1546) bool vectored, bool compat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1547) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1548) struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1549) struct iov_iter iter;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1550) struct file *file;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1551) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1552)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1553) ret = aio_prep_rw(req, iocb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1554) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1555) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1556) file = req->ki_filp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1557)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1558) if (unlikely(!(file->f_mode & FMODE_WRITE)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1559) return -EBADF;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1560) if (unlikely(!file->f_op->write_iter))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1561) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1562)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1563) ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1564) if (ret < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1565) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1566) ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1567) if (!ret) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1568) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1569) * Open-code file_start_write here to grab freeze protection,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1570) * which will be released by another thread in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1571) * aio_complete_rw(). Fool lockdep by telling it the lock got
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1572) * released so that it doesn't complain about the held lock when
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1573) * we return to userspace.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1574) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1575) if (S_ISREG(file_inode(file)->i_mode)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1576) sb_start_write(file_inode(file)->i_sb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1577) __sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1578) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1579) req->ki_flags |= IOCB_WRITE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1580) aio_rw_done(req, call_write_iter(file, req, &iter));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1581) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1582) kfree(iovec);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1583) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1584) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1585)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1586) static void aio_fsync_work(struct work_struct *work)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1587) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1588) struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, fsync.work);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1589) const struct cred *old_cred = override_creds(iocb->fsync.creds);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1590)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1591) iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1592) revert_creds(old_cred);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1593) put_cred(iocb->fsync.creds);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1594) iocb_put(iocb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1595) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1596)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1597) static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1598) bool datasync)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1599) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1600) if (unlikely(iocb->aio_buf || iocb->aio_offset || iocb->aio_nbytes ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1601) iocb->aio_rw_flags))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1602) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1603)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1604) if (unlikely(!req->file->f_op->fsync))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1605) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1606)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1607) req->creds = prepare_creds();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1608) if (!req->creds)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1609) return -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1610)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1611) req->datasync = datasync;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1612) INIT_WORK(&req->work, aio_fsync_work);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1613) schedule_work(&req->work);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1614) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1615) }
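
/*
 * Illustrative userspace fragment (not part of this file, not compiled):
 * an fsync iocb as accepted by aio_fsync() above.  aio_buf, aio_nbytes,
 * aio_offset and aio_rw_flags must be left zero, otherwise the checks
 * above return -EINVAL.  The helper name is invented for the example.
 */
#if 0
#include <linux/aio_abi.h>
#include <string.h>

static void fill_fsync_iocb(struct iocb *cb, int fd, int datasync)
{
	memset(cb, 0, sizeof(*cb));
	cb->aio_lio_opcode = datasync ? IOCB_CMD_FDSYNC : IOCB_CMD_FSYNC;
	cb->aio_fildes = fd;
	/* aio_buf, aio_nbytes, aio_offset and aio_rw_flags stay zero. */
}
#endif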
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1616)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1617) static void aio_poll_put_work(struct work_struct *work)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1618) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1619) struct poll_iocb *req = container_of(work, struct poll_iocb, work);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1620) struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1621)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1622) iocb_put(iocb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1623) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1624)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1625) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1626) * Safely lock the waitqueue which the request is on, synchronizing with the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1627) * case where the ->poll() provider decides to free its waitqueue early.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1628) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1629) * Returns true on success, meaning that req->head->lock was locked, req->wait
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1630) * is on req->head, and an RCU read lock was taken. Returns false if the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1631) * request was already removed from its waitqueue (which might no longer exist).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1632) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1633) static bool poll_iocb_lock_wq(struct poll_iocb *req)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1634) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1635) wait_queue_head_t *head;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1636)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1637) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1638) * While we hold the waitqueue lock and the waitqueue is nonempty,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1639) * wake_up_pollfree() will wait for us. However, taking the waitqueue
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1640) * lock in the first place can race with the waitqueue being freed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1641) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1642) * We solve this as eventpoll does: by taking advantage of the fact that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1643) * all users of wake_up_pollfree() will RCU-delay the actual free. If
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1644) * we enter rcu_read_lock() and see that the pointer to the queue is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1645) * non-NULL, we can then lock it without the memory being freed out from
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1646) * under us, then check whether the request is still on the queue.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1647) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1648) * Keep holding rcu_read_lock() as long as we hold the queue lock, in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1649) * case the caller deletes the entry from the queue, leaving it empty.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1650) * In that case, only RCU prevents the queue memory from being freed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1651) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1652) rcu_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1653) head = smp_load_acquire(&req->head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1654) if (head) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1655) spin_lock(&head->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1656) if (!list_empty(&req->wait.entry))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1657) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1658) spin_unlock(&head->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1659) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1660) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1661) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1662) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1663)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1664) static void poll_iocb_unlock_wq(struct poll_iocb *req)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1665) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1666) spin_unlock(&req->head->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1667) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1668) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1669)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1670) static void aio_poll_complete_work(struct work_struct *work)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1671) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1672) struct poll_iocb *req = container_of(work, struct poll_iocb, work);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1673) struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1674) struct poll_table_struct pt = { ._key = req->events };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1675) struct kioctx *ctx = iocb->ki_ctx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1676) __poll_t mask = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1677)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1678) if (!READ_ONCE(req->cancelled))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1679) mask = vfs_poll(req->file, &pt) & req->events;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1680)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1681) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1682) * Note that ->ki_cancel callers also delete iocb from active_reqs after
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1683) * calling ->ki_cancel. We need the ctx_lock roundtrip here to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1684) * synchronize with them. In the cancellation case the list_del_init
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1685) * itself is not actually needed, but it is harmless, so we keep it to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1686) * avoid further branches in the fast path.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1687) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1688) spin_lock_irq(&ctx->ctx_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1689) if (poll_iocb_lock_wq(req)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1690) if (!mask && !READ_ONCE(req->cancelled)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1691) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1692) * The request isn't actually ready to be completed yet.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1693) * Reschedule completion if another wakeup came in.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1694) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1695) if (req->work_need_resched) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1696) schedule_work(&req->work);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1697) req->work_need_resched = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1698) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1699) req->work_scheduled = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1700) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1701) poll_iocb_unlock_wq(req);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1702) spin_unlock_irq(&ctx->ctx_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1703) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1704) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1705) list_del_init(&req->wait.entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1706) poll_iocb_unlock_wq(req);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1707) } /* else, POLLFREE has freed the waitqueue, so we must complete */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1708) list_del_init(&iocb->ki_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1709) iocb->ki_res.res = mangle_poll(mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1710) spin_unlock_irq(&ctx->ctx_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1711)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1712) iocb_put(iocb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1713) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1714)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1715) /* assumes we are called with irqs disabled */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1716) static int aio_poll_cancel(struct kiocb *iocb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1717) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1718) struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1719) struct poll_iocb *req = &aiocb->poll;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1720)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1721) if (poll_iocb_lock_wq(req)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1722) WRITE_ONCE(req->cancelled, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1723) if (!req->work_scheduled) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1724) schedule_work(&aiocb->poll.work);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1725) req->work_scheduled = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1726) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1727) poll_iocb_unlock_wq(req);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1728) } /* else, the request was force-cancelled by POLLFREE already */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1729)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1730) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1731) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1732)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1733) static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1734) void *key)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1735) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1736) struct poll_iocb *req = container_of(wait, struct poll_iocb, wait);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1737) struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1738) __poll_t mask = key_to_poll(key);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1739) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1740)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1741) /* for instances that support it, check for an event match first: */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1742) if (mask && !(mask & req->events))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1743) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1744)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1745) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1746) * Complete the request inline if possible. This requires that three
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1747) * conditions be met:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1748) * 1. An event mask must have been passed. If a plain wakeup was done
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1749) * instead, then mask == 0 and we have to call vfs_poll() to get
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1750) * the events, so inline completion isn't possible.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1751) * 2. The completion work must not have already been scheduled.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1752) * 3. ctx_lock must not be busy. We have to use trylock because we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1753) * already hold the waitqueue lock, so this inverts the normal
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1754) * locking order. Use irqsave/irqrestore because not all
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1755) * filesystems (e.g. fuse) call this function with IRQs disabled,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1756) * yet IRQs have to be disabled before ctx_lock is obtained.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1757) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1758) if (mask && !req->work_scheduled &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1759) spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1760) struct kioctx *ctx = iocb->ki_ctx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1761)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1762) list_del_init(&req->wait.entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1763) list_del(&iocb->ki_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1764) iocb->ki_res.res = mangle_poll(mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1765) if (iocb->ki_eventfd && eventfd_signal_count()) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1766) iocb = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1767) INIT_WORK(&req->work, aio_poll_put_work);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1768) schedule_work(&req->work);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1769) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1770) spin_unlock_irqrestore(&ctx->ctx_lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1771) if (iocb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1772) iocb_put(iocb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1773) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1774) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1775) * Schedule the completion work if needed. If it was already
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1776) * scheduled, record that another wakeup came in.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1777) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1778) * Don't remove the request from the waitqueue here, as it might
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1779) * not actually be complete yet (we won't know until vfs_poll()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1780) * is called), and we must not miss any wakeups. POLLFREE is an
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1781) * exception to this; see below.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1782) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1783) if (req->work_scheduled) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1784) req->work_need_resched = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1785) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1786) schedule_work(&req->work);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1787) req->work_scheduled = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1788) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1789)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1790) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1791) * If the waitqueue is being freed early but we can't complete
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1792) * the request inline, we have to tear down the request as best
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1793) * we can. That means immediately removing the request from its
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1794) * waitqueue and preventing all further accesses to the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1795) * waitqueue via the request. We also need to schedule the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1796) * completion work (done above). Also mark the request as
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1797) * cancelled, to potentially skip an unneeded call to ->poll().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1798) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1799) if (mask & POLLFREE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1800) WRITE_ONCE(req->cancelled, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1801) list_del_init(&req->wait.entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1802)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1803) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1804) * Careful: this *must* be the last step, since as soon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1805) * as req->head is NULL'ed out, the request can be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1806) * completed and freed, since aio_poll_complete_work()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1807) * will no longer need to take the waitqueue lock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1808) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1809) smp_store_release(&req->head, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1810) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1811) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1812) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1813) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1814)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1815) struct aio_poll_table {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1816) struct poll_table_struct pt;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1817) struct aio_kiocb *iocb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1818) bool queued;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1819) int error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1820) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1821)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1822) static void
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1823) aio_poll_queue_proc(struct file *file, struct wait_queue_head *head,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1824) struct poll_table_struct *p)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1825) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1826) struct aio_poll_table *pt = container_of(p, struct aio_poll_table, pt);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1827)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1828) /* multiple wait queues per file are not supported */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1829) if (unlikely(pt->queued)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1830) pt->error = -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1831) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1832) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1833)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1834) pt->queued = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1835) pt->error = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1836) pt->iocb->poll.head = head;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1837) add_wait_queue(head, &pt->iocb->poll.wait);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1838) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1839)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1840) static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1841) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1842) struct kioctx *ctx = aiocb->ki_ctx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1843) struct poll_iocb *req = &aiocb->poll;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1844) struct aio_poll_table apt;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1845) bool cancel = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1846) __poll_t mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1847)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1848) /* the poll events are passed in aio_buf; reject any bits outside the 16-bit event mask. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1849) if ((u16)iocb->aio_buf != iocb->aio_buf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1850) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1851) /* reject fields that are not defined for poll */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1852) if (iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1853) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1854)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1855) INIT_WORK(&req->work, aio_poll_complete_work);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1856) req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1857)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1858) req->head = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1859) req->cancelled = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1860) req->work_scheduled = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1861) req->work_need_resched = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1862)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1863) apt.pt._qproc = aio_poll_queue_proc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1864) apt.pt._key = req->events;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1865) apt.iocb = aiocb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1866) apt.queued = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1867) apt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1868)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1869) /* initialize the list so that we can do list_empty checks */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1870) INIT_LIST_HEAD(&req->wait.entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1871) init_waitqueue_func_entry(&req->wait, aio_poll_wake);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1872)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1873) mask = vfs_poll(req->file, &apt.pt) & req->events;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1874) spin_lock_irq(&ctx->ctx_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1875) if (likely(apt.queued)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1876) bool on_queue = poll_iocb_lock_wq(req);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1877)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1878) if (!on_queue || req->work_scheduled) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1879) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1880) * aio_poll_wake() already either scheduled the async
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1881) * completion work, or completed the request inline.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1882) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1883) if (apt.error) /* unsupported case: multiple queues */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1884) cancel = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1885) apt.error = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1886) mask = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1887) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1888) if (mask || apt.error) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1889) /* Steal to complete synchronously. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1890) list_del_init(&req->wait.entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1891) } else if (cancel) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1892) /* Cancel if possible (may be too late though). */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1893) WRITE_ONCE(req->cancelled, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1894) } else if (on_queue) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1895) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1896) * Actually waiting for an event, so add the request to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1897) * active_reqs so that it can be cancelled if needed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1898) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1899) list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1900) aiocb->ki_cancel = aio_poll_cancel;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1901) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1902) if (on_queue)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1903) poll_iocb_unlock_wq(req);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1904) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1905) if (mask) { /* no async, we'd stolen it */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1906) aiocb->ki_res.res = mangle_poll(mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1907) apt.error = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1908) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1909) spin_unlock_irq(&ctx->ctx_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1910) if (mask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1911) iocb_put(aiocb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1912) return apt.error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1913) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1914)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1915) static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1916) struct iocb __user *user_iocb, struct aio_kiocb *req,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1917) bool compat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1918) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1919) req->ki_filp = fget(iocb->aio_fildes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1920) if (unlikely(!req->ki_filp))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1921) return -EBADF;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1922)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1923) if (iocb->aio_flags & IOCB_FLAG_RESFD) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1924) struct eventfd_ctx *eventfd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1925) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1926) * If the IOCB_FLAG_RESFD flag of aio_flags is set, grab a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1927) * reference to the eventfd context now. The file descriptor
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1928) * must refer to an eventfd, and it will be signaled for each
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1929) * completed event using the eventfd_signal() function.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1930) */
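		/*
		 * Illustrative userspace sketch (not kernel code), assuming
		 * "data_fd", "buf" and "efd" are placeholders and "efd" comes
		 * from eventfd(2); the field names are those of struct iocb
		 * in <linux/aio_abi.h>:
		 *
		 *	struct iocb cb = { 0 };
		 *
		 *	cb.aio_fildes     = data_fd;
		 *	cb.aio_lio_opcode = IOCB_CMD_PREAD;
		 *	cb.aio_buf        = (__u64)(unsigned long)buf;
		 *	cb.aio_nbytes     = sizeof(buf);
		 *	cb.aio_flags      = IOCB_FLAG_RESFD;
		 *	cb.aio_resfd      = efd;
		 *
		 * Each completion of this iocb then adds one to the eventfd
		 * counter in addition to posting an io_event to the ring.
		 */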
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1931) eventfd = eventfd_ctx_fdget(iocb->aio_resfd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1932) if (IS_ERR(eventfd))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1933) return PTR_ERR(eventfd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1934)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1935) req->ki_eventfd = eventfd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1936) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1937)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1938) if (unlikely(put_user(KIOCB_KEY, &user_iocb->aio_key))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1939) pr_debug("EFAULT: aio_key\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1940) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1941) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1942)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1943) req->ki_res.obj = (u64)(unsigned long)user_iocb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1944) req->ki_res.data = iocb->aio_data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1945) req->ki_res.res = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1946) req->ki_res.res2 = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1947)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1948) switch (iocb->aio_lio_opcode) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1949) case IOCB_CMD_PREAD:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1950) return aio_read(&req->rw, iocb, false, compat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1951) case IOCB_CMD_PWRITE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1952) return aio_write(&req->rw, iocb, false, compat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1953) case IOCB_CMD_PREADV:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1954) return aio_read(&req->rw, iocb, true, compat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1955) case IOCB_CMD_PWRITEV:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1956) return aio_write(&req->rw, iocb, true, compat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1957) case IOCB_CMD_FSYNC:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1958) return aio_fsync(&req->fsync, iocb, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1959) case IOCB_CMD_FDSYNC:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1960) return aio_fsync(&req->fsync, iocb, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1961) case IOCB_CMD_POLL:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1962) return aio_poll(req, iocb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1963) default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1964) pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1965) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1966) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1967) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1968)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1969) static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1970) bool compat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1971) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1972) struct aio_kiocb *req;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1973) struct iocb iocb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1974) int err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1975)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1976) if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb))))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1977) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1978)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1979) /* enforce forwards compatibility on users */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1980) if (unlikely(iocb.aio_reserved2)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1981) pr_debug("EINVAL: reserved field set\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1982) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1983) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1984)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1985) /* prevent overflows */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1986) if (unlikely(
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1987) (iocb.aio_buf != (unsigned long)iocb.aio_buf) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1988) (iocb.aio_nbytes != (size_t)iocb.aio_nbytes) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1989) ((ssize_t)iocb.aio_nbytes < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1990) )) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1991) pr_debug("EINVAL: overflow check\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1992) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1993) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1994)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1995) req = aio_get_req(ctx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1996) if (unlikely(!req))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1997) return -EAGAIN;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1998)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1999) err = __io_submit_one(ctx, &iocb, user_iocb, req, compat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2000)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2001) /* Done with the synchronous reference */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2002) iocb_put(req);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2003)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2004) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2005) * If err is 0, we've either done aio_complete() ourselves or have
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2006) * arranged for that to be done asynchronously. Anything non-zero
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2007) * means that we need to destroy req ourselves.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2008) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2009) if (unlikely(err)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2010) iocb_destroy(req);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2011) put_reqs_available(ctx, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2012) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2013) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2014) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2015)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2016) /* sys_io_submit:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2017) * Queue the nr iocbs pointed to by iocbpp for processing. Returns
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2018) * the number of iocbs queued. May return -EINVAL if the aio_context
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2019) * specified by ctx_id is invalid, if nr is < 0, if the iocb at
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2020) * *iocbpp[0] is not properly initialized, or if the operation specified
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2021) * is invalid for the file descriptor in the iocb. May fail with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2022) * -EFAULT if any of the data structures point to invalid data. May
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2023) * fail with -EBADF if the file descriptor specified in the first
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2024) * iocb is invalid. May fail with -EAGAIN if insufficient resources
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2025) * are available to queue any iocbs. Will return 0 if nr is 0. Will
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2026) * fail with -ENOSYS if not implemented.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2027) */
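/*
 * Illustrative userspace sketch (not kernel code) of submitting one read
 * through the raw syscall interface.  "fd" and "buf" are placeholders,
 * error handling is omitted, and the statements are assumed to sit inside
 * a function below the #includes:
 *
 *	#include <linux/aio_abi.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <string.h>
 *
 *	aio_context_t ctx = 0;
 *	struct iocb cb;
 *	struct iocb *cbs[1] = { &cb };
 *	long ret;
 *
 *	syscall(__NR_io_setup, 128, &ctx);
 *	memset(&cb, 0, sizeof(cb));
 *	cb.aio_fildes     = fd;
 *	cb.aio_lio_opcode = IOCB_CMD_PREAD;
 *	cb.aio_buf        = (__u64)(unsigned long)buf;
 *	cb.aio_nbytes     = 4096;
 *	ret = syscall(__NR_io_submit, ctx, 1, cbs);
 *
 * Here 128 is the requested queue depth; a return value of 1 means the
 * single iocb was queued, and its completion is reaped with io_getevents().
 */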
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2028) SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2029) struct iocb __user * __user *, iocbpp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2030) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2031) struct kioctx *ctx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2032) long ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2033) int i = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2034) struct blk_plug plug;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2035)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2036) if (unlikely(nr < 0))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2037) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2038)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2039) ctx = lookup_ioctx(ctx_id);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2040) if (unlikely(!ctx)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2041) pr_debug("EINVAL: invalid context id\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2042) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2043) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2044)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2045) if (nr > ctx->nr_events)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2046) nr = ctx->nr_events;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2047)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2048) if (nr > AIO_PLUG_THRESHOLD)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2049) blk_start_plug(&plug);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2050) for (i = 0; i < nr; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2051) struct iocb __user *user_iocb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2052)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2053) if (unlikely(get_user(user_iocb, iocbpp + i))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2054) ret = -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2055) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2056) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2057)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2058) ret = io_submit_one(ctx, user_iocb, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2059) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2060) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2061) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2062) if (nr > AIO_PLUG_THRESHOLD)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2063) blk_finish_plug(&plug);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2064)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2065) percpu_ref_put(&ctx->users);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2066) return i ? i : ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2067) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2068)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2069) #ifdef CONFIG_COMPAT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2070) COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2071) int, nr, compat_uptr_t __user *, iocbpp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2072) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2073) struct kioctx *ctx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2074) long ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2075) int i = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2076) struct blk_plug plug;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2077)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2078) if (unlikely(nr < 0))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2079) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2080)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2081) ctx = lookup_ioctx(ctx_id);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2082) if (unlikely(!ctx)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2083) pr_debug("EINVAL: invalid context id\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2084) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2085) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2086)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2087) if (nr > ctx->nr_events)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2088) nr = ctx->nr_events;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2089)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2090) if (nr > AIO_PLUG_THRESHOLD)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2091) blk_start_plug(&plug);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2092) for (i = 0; i < nr; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2093) compat_uptr_t user_iocb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2094)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2095) if (unlikely(get_user(user_iocb, iocbpp + i))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2096) ret = -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2097) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2098) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2099)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2100) ret = io_submit_one(ctx, compat_ptr(user_iocb), true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2101) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2102) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2103) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2104) if (nr > AIO_PLUG_THRESHOLD)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2105) blk_finish_plug(&plug);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2106)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2107) percpu_ref_put(&ctx->users);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2108) return i ? i : ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2109) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2110) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2111)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2112) /* sys_io_cancel:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2113) * Attempts to cancel an iocb previously passed to io_submit(). If
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2114) * the cancellation is started, -EINPROGRESS is returned and the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2115) * completion event is delivered through the completion queue as
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2116) * usual; the result argument is no longer written to. May fail
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2117) * with -EFAULT if any of the data structures pointed to are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2118) * invalid. May fail with -EINVAL if the aio_context specified by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2119) * ctx_id is invalid. May fail with -EAGAIN if the iocb specified
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2120) * was not cancelled. Will fail with -ENOSYS if not implemented.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2121) */
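/*
 * Illustrative userspace sketch (not kernel code) of cancelling the iocb
 * "cb" submitted above on context "ctx" (both placeholders; <errno.h> and
 * <stdio.h> assumed):
 *
 *	struct io_event unused;
 *	long ret = syscall(__NR_io_cancel, ctx, &cb, &unused);
 *
 *	if (ret == -1 && errno == EINPROGRESS)
 *		printf("cancel queued; the final event still arrives via io_getevents\n");
 *
 * The result argument is accepted for ABI compatibility, but this
 * implementation never writes to it.
 */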
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2122) SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2123) struct io_event __user *, result)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2124) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2125) struct kioctx *ctx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2126) struct aio_kiocb *kiocb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2127) int ret = -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2128) u32 key;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2129) u64 obj = (u64)(unsigned long)iocb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2130)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2131) if (unlikely(get_user(key, &iocb->aio_key)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2132) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2133) if (unlikely(key != KIOCB_KEY))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2134) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2135)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2136) ctx = lookup_ioctx(ctx_id);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2137) if (unlikely(!ctx))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2138) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2139)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2140) spin_lock_irq(&ctx->ctx_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2141) /* TODO: use a hash or array, this sucks. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2142) list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2143) if (kiocb->ki_res.obj == obj) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2144) ret = kiocb->ki_cancel(&kiocb->rw);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2145) list_del_init(&kiocb->ki_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2146) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2147) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2148) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2149) spin_unlock_irq(&ctx->ctx_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2150)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2151) if (!ret) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2152) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2153) * The result argument is no longer used - the io_event is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2154) * always delivered via the ring buffer. -EINPROGRESS indicates
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2155) * cancellation is in progress:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2156) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2157) ret = -EINPROGRESS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2158) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2159)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2160) percpu_ref_put(&ctx->users);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2161)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2162) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2163) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2164)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2165) static long do_io_getevents(aio_context_t ctx_id,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2166) long min_nr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2167) long nr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2168) struct io_event __user *events,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2169) struct timespec64 *ts)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2170) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2171) ktime_t until = ts ? timespec64_to_ktime(*ts) : KTIME_MAX;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2172) struct kioctx *ioctx = lookup_ioctx(ctx_id);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2173) long ret = -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2174)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2175) if (likely(ioctx)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2176) if (likely(min_nr <= nr && min_nr >= 0))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2177) ret = read_events(ioctx, min_nr, nr, events, until);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2178) percpu_ref_put(&ioctx->users);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2179) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2180)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2181) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2182) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2183)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2184) /* io_getevents:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2185) * Attempts to read at least min_nr events and up to nr events from
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2186) * the completion queue for the aio_context specified by ctx_id. If
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2187) * it succeeds, the number of read events is returned. May fail with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2188) * -EINVAL if ctx_id is invalid, if min_nr is out of range, if nr is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2189) * out of range, or if timeout is out of range. May fail with -EFAULT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2190) * if any of the memory specified is invalid. May return 0 or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2191) * < min_nr if the timeout specified by timeout has elapsed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2192) * before sufficient events are available; a NULL timeout
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2193) * specifies an infinite timeout. Note that the timeout pointed to by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2194) * timeout is relative. Will fail with -ENOSYS if not implemented.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2195) */
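/*
 * Illustrative userspace sketch (not kernel code) of reaping completions
 * for the "ctx" set up in the io_submit sketch above (<stdio.h> and
 * <time.h> assumed; no error handling):
 *
 *	struct io_event ev[8];
 *	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 *	long n = syscall(__NR_io_getevents, ctx, 1, 8, ev, &ts);
 *
 *	for (long i = 0; i < n; i++)
 *		printf("data=%llu res=%lld\n",
 *		       (unsigned long long)ev[i].data, (long long)ev[i].res);
 *
 * ev[i].res is the byte count (or a negated errno) and ev[i].data echoes
 * the aio_data cookie set in the submitted iocb.
 */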
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2196) #ifdef CONFIG_64BIT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2197)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2198) SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2199) long, min_nr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2200) long, nr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2201) struct io_event __user *, events,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2202) struct __kernel_timespec __user *, timeout)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2203) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2204) struct timespec64 ts;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2205) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2206)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2207) if (timeout && unlikely(get_timespec64(&ts, timeout)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2208) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2209)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2210) ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2211) if (!ret && signal_pending(current))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2212) ret = -EINTR;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2213) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2214) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2215)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2216) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2217)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2218) struct __aio_sigset {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2219) const sigset_t __user *sigmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2220) size_t sigsetsize;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2221) };
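/*
 * Illustrative userspace sketch (not kernel code) of filling the usig
 * argument of io_pgetevents(); the layout mirrors the structure above.
 * Note that sigsetsize must be the kernel's sigset size (_NSIG / 8,
 * i.e. 8 on x86-64), not glibc's sizeof(sigset_t).  "ctx", "ev" and "ts"
 * are the placeholders from the earlier sketches:
 *
 *	sigset_t block;
 *	struct { const sigset_t *sigmask; size_t sigsetsize; } usig;
 *
 *	sigemptyset(&block);
 *	sigaddset(&block, SIGUSR1);
 *	usig.sigmask    = &block;
 *	usig.sigsetsize = _NSIG / 8;
 *
 *	syscall(__NR_io_pgetevents, ctx, 1, 8, ev, &ts, &usig);
 *
 * As with pselect(2), the thread's signal mask is replaced by *sigmask for
 * the duration of the wait and restored afterwards.
 */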
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2222)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2223) SYSCALL_DEFINE6(io_pgetevents,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2224) aio_context_t, ctx_id,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2225) long, min_nr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2226) long, nr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2227) struct io_event __user *, events,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2228) struct __kernel_timespec __user *, timeout,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2229) const struct __aio_sigset __user *, usig)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2230) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2231) struct __aio_sigset ksig = { NULL, };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2232) struct timespec64 ts;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2233) bool interrupted;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2234) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2235)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2236) if (timeout && unlikely(get_timespec64(&ts, timeout)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2237) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2238)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2239) if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2240) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2241)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2242) ret = set_user_sigmask(ksig.sigmask, ksig.sigsetsize);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2243) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2244) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2245)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2246) ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2247)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2248) interrupted = signal_pending(current);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2249) restore_saved_sigmask_unless(interrupted);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2250) if (interrupted && !ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2251) ret = -ERESTARTNOHAND;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2252)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2253) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2254) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2255)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2256) #if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2257)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2258) SYSCALL_DEFINE6(io_pgetevents_time32,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2259) aio_context_t, ctx_id,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2260) long, min_nr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2261) long, nr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2262) struct io_event __user *, events,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2263) struct old_timespec32 __user *, timeout,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2264) const struct __aio_sigset __user *, usig)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2265) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2266) struct __aio_sigset ksig = { NULL, };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2267) struct timespec64 ts;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2268) bool interrupted;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2269) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2270)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2271) if (timeout && unlikely(get_old_timespec32(&ts, timeout)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2272) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2273)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2274) if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2275) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2276)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2278) ret = set_user_sigmask(ksig.sigmask, ksig.sigsetsize);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2279) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2280) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2281)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2282) ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2283)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2284) interrupted = signal_pending(current);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2285) restore_saved_sigmask_unless(interrupted);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2286) if (interrupted && !ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2287) ret = -ERESTARTNOHAND;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2288)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2289) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2290) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2291)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2292) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2293)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2294) #if defined(CONFIG_COMPAT_32BIT_TIME)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2295)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2296) SYSCALL_DEFINE5(io_getevents_time32, __u32, ctx_id,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2297) __s32, min_nr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2298) __s32, nr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2299) struct io_event __user *, events,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2300) struct old_timespec32 __user *, timeout)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2301) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2302) struct timespec64 t;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2303) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2304)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2305) if (timeout && get_old_timespec32(&t, timeout))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2306) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2307)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2308) ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2309) if (!ret && signal_pending(current))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2310) ret = -EINTR;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2311) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2312) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2313)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2314) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2315)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2316) #ifdef CONFIG_COMPAT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2317)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2318) struct __compat_aio_sigset {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2319) compat_uptr_t sigmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2320) compat_size_t sigsetsize;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2321) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2322)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2323) #if defined(CONFIG_COMPAT_32BIT_TIME)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2324)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2325) COMPAT_SYSCALL_DEFINE6(io_pgetevents,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2326) compat_aio_context_t, ctx_id,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2327) compat_long_t, min_nr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2328) compat_long_t, nr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2329) struct io_event __user *, events,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2330) struct old_timespec32 __user *, timeout,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2331) const struct __compat_aio_sigset __user *, usig)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2332) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2333) struct __compat_aio_sigset ksig = { 0, };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2334) struct timespec64 t;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2335) bool interrupted;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2336) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2337)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2338) if (timeout && get_old_timespec32(&t, timeout))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2339) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2340)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2341) if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2342) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2343)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2344) ret = set_compat_user_sigmask(compat_ptr(ksig.sigmask), ksig.sigsetsize);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2345) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2346) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2347)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2348) ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2349)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2350) interrupted = signal_pending(current);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2351) restore_saved_sigmask_unless(interrupted);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2352) if (interrupted && !ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2353) ret = -ERESTARTNOHAND;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2354)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2355) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2356) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2357)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2358) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2359)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2360) COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2361) compat_aio_context_t, ctx_id,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2362) compat_long_t, min_nr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2363) compat_long_t, nr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2364) struct io_event __user *, events,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2365) struct __kernel_timespec __user *, timeout,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2366) const struct __compat_aio_sigset __user *, usig)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2367) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2368) struct __compat_aio_sigset ksig = { 0, };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2369) struct timespec64 t;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2370) bool interrupted;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2371) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2372)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2373) if (timeout && get_timespec64(&t, timeout))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2374) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2375)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2376) if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2377) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2378)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2379) ret = set_compat_user_sigmask(compat_ptr(ksig.sigmask), ksig.sigsetsize);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2380) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2381) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2382)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2383) ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2384)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2385) interrupted = signal_pending(current);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2386) restore_saved_sigmask_unless(interrupted);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2387) if (interrupted && !ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2388) ret = -ERESTARTNOHAND;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2389)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2390) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2391) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2392) #endif