^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1) // SPDX-License-Identifier: GPL-2.0-or-later
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3) * fs/eventpoll.c (Efficient event retrieval implementation)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4) * Copyright (C) 2001,...,2009 Davide Libenzi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6) * Davide Libenzi <davidel@xmailserver.org>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9) #include <linux/init.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 10) #include <linux/kernel.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 11) #include <linux/sched/signal.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 12) #include <linux/fs.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 13) #include <linux/file.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 14) #include <linux/signal.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 15) #include <linux/errno.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 16) #include <linux/mm.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 17) #include <linux/slab.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 18) #include <linux/poll.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 19) #include <linux/string.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 20) #include <linux/list.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 21) #include <linux/hash.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 22) #include <linux/spinlock.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 23) #include <linux/syscalls.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 24) #include <linux/rbtree.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 25) #include <linux/wait.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 26) #include <linux/eventpoll.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 27) #include <linux/mount.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 28) #include <linux/bitops.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 29) #include <linux/mutex.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 30) #include <linux/anon_inodes.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 31) #include <linux/device.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 32) #include <linux/freezer.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 33) #include <linux/uaccess.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 34) #include <asm/io.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 35) #include <asm/mman.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 36) #include <linux/atomic.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 37) #include <linux/proc_fs.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 38) #include <linux/seq_file.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 39) #include <linux/compat.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 40) #include <linux/rculist.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 41) #include <net/busy_poll.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 42)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 43) #include <trace/hooks/fs.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 44)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 45) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 46) * LOCKING:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 47) * There are three levels of locking required by epoll:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 48) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 49) * 1) epmutex (mutex)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 50) * 2) ep->mtx (mutex)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 51) * 3) ep->lock (rwlock)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 52) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 53) * The acquire order is the one listed above, from 1 to 3.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 54) * We need a rwlock (ep->lock) because we manipulate objects
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 55) * from inside the poll callback, which might be triggered from
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 56) * a wake_up() that in turn might be called from IRQ context.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 57) * So we can't sleep inside the poll callback and hence we need
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 58) * a spinlock. During the event transfer loop (from kernel to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 59) * user space) we could end up sleeping due to a copy_to_user(), so
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 60) * we need a lock that will allow us to sleep. This lock is a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 61) * mutex (ep->mtx). It is acquired during the event transfer loop,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 62) * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 63) * Then we also need a global mutex to serialize eventpoll_release_file()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 64) * and ep_free().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 65) * This mutex is acquired by ep_free() during the epoll file
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 66) * cleanup path and it is also acquired by eventpoll_release_file()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 67) * if a file has been pushed inside an epoll set and it is then
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 68) * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 69) * It is also acquired when inserting an epoll fd onto another epoll
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 70) * fd. We do this so that we walk the epoll tree and ensure that this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 71) * insertion does not create a cycle of epoll file descriptors, which
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 72) * could lead to deadlock. We need a global mutex to prevent two
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 73) * simultaneous inserts (A into B and B into A) from racing and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 74) * constructing a cycle without either insert observing that it is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 75) * going to.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 76) * It is necessary to acquire multiple "ep->mtx"es at once in the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 77) * case when one epoll fd is added to another. In this case, we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 78) * always acquire the locks in the order of nesting (i.e. after
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 79) * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 80) * before e2->mtx). Since we disallow cycles of epoll file
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 81) * descriptors, this ensures that the mutexes are well-ordered. In
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 82) * order to communicate this nesting to lockdep, when walking a tree
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 83) * of epoll file descriptors, we use the current recursion depth as
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 84) * the lockdep subkey.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 85) * It is possible to drop the "ep->mtx" and to use the global
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 86) * mutex "epmutex" (together with "ep->lock") to have it working,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 87) * but having "ep->mtx" will make the interface more scalable.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 88) * Events that require holding "epmutex" are very rare, while for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 89) * normal operations the epoll private "ep->mtx" will guarantee
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 90) * better scalability.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 91) */
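
/*
 * For illustration only (a rough userspace sketch, not a complete
 * program): the "e1->mtx before e2->mtx" ordering described above is
 * exercised whenever one epoll fd is added to another:
 *
 *	efd1 = epoll_create1(0);
 *	efd2 = epoll_create1(0);
 *	epoll_ctl(efd2, EPOLL_CTL_ADD, some_fd, ...);
 *	epoll_ctl(efd1, EPOLL_CTL_ADD, efd2, ...);
 *
 * While servicing the last EPOLL_CTL_ADD, efd1's "ep->mtx" is held and
 * efd2's "ep->mtx" may be taken at a deeper lockdep nesting level, with
 * "epmutex" held as well so the tree can be checked for cycles.
 */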
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 92)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 93) /* Epoll private bits inside the event mask */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 94) #define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 95)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 96) #define EPOLLINOUT_BITS (EPOLLIN | EPOLLOUT)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 97)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 98) #define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | EPOLLERR | EPOLLHUP | \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 99) EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 100)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 101) /* Maximum nesting level allowed inside epoll sets */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 102) #define EP_MAX_NESTS 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 103)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 104) #define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 105)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 106) #define EP_UNACTIVE_PTR ((void *) -1L)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 107)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 108) #define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 109)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 110) struct epoll_filefd {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 111) struct file *file;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 112) int fd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 113) } __packed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 114)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 115) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 116) * Structure used to track possible nested calls, to detect overly deep
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 117) * recursion and loop cycles.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 118) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 119) struct nested_call_node {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 120) struct list_head llink;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 121) void *cookie;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 122) void *ctx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 123) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 124)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 125) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 126) * This structure is used as a collector for nested calls, to check for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 127) * maximum recursion depth and loop cycles.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 128) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 129) struct nested_calls {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 130) struct list_head tasks_call_list;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 131) spinlock_t lock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 132) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 133)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 134) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 135) * Each file descriptor added to the eventpoll interface will
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 136) * have an entry of this type linked to the "rbr" RB tree.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 137) * Avoid increasing the size of this struct; there can be many thousands
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 138) * of these on a server and we do not want this to take another cache line.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 139) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 140) struct epitem {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 141) union {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 142) /* RB tree node links this structure to the eventpoll RB tree */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 143) struct rb_node rbn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 144) /* Used to free the struct epitem */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 145) struct rcu_head rcu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 146) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 147)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 148) /* List header used to link this structure to the eventpoll ready list */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 149) struct list_head rdllink;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 150)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 151) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 152) * Works together with "struct eventpoll"->ovflist in keeping the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 153) * singly linked chain of items.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 154) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 155) struct epitem *next;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 156)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 157) /* The file descriptor information this item refers to */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 158) struct epoll_filefd ffd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 159)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 160) /* Number of active wait queues attached to poll operations */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 161) int nwait;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 162)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 163) /* List containing poll wait queues */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 164) struct list_head pwqlist;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 165)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 166) /* The "container" of this item */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 167) struct eventpoll *ep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 168)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 169) /* List header used to link this item to the "struct file" items list */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 170) struct list_head fllink;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 171)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 172) /* wakeup_source used when EPOLLWAKEUP is set */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 173) struct wakeup_source __rcu *ws;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 174)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 175) /* The structure that describes the events of interest and the source fd */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 176) struct epoll_event event;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 177) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 178)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 179) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 180) * This structure is stored inside the "private_data" member of the file
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 181) * structure and represents the main data structure for the eventpoll
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 182) * interface.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 183) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 184) struct eventpoll {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 185) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 186) * This mutex is used to ensure that files are not removed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 187) * while epoll is using them. This is held during the event
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 188) * collection loop, the file cleanup path, the epoll file exit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 189) * code and the ctl operations.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 190) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 191) struct mutex mtx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 192)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 193) /* Wait queue used by sys_epoll_wait() */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 194) wait_queue_head_t wq;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 195)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 196) /* Wait queue used by file->poll() */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 197) wait_queue_head_t poll_wait;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 198)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 199) /* List of ready file descriptors */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 200) struct list_head rdllist;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 201)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 202) /* Lock which protects rdllist and ovflist */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 203) rwlock_t lock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 204)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 205) /* RB tree root used to store monitored fd structs */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 206) struct rb_root_cached rbr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 207)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 208) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 209) * This is a singly linked list that chains all the "struct epitem" that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 210) * had events occur while ready events were being transferred to user space
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 211) * without holding ->lock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 212) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 213) struct epitem *ovflist;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 214)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 215) /* wakeup_source used when ep_scan_ready_list is running */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 216) struct wakeup_source *ws;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 217)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 218) /* The user that created the eventpoll descriptor */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 219) struct user_struct *user;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 220)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 221) struct file *file;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 222)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 223) /* used to optimize loop detection check */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 224) u64 gen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 225)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 226) #ifdef CONFIG_NET_RX_BUSY_POLL
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 227) /* used to track busy poll napi_id */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 228) unsigned int napi_id;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 229) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 230)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 231) #ifdef CONFIG_DEBUG_LOCK_ALLOC
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 232) /* tracks wakeup nests for lockdep validation */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 233) u8 nests;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 234) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 235) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 236)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 237) /* Wait structure used by the poll hooks */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 238) struct eppoll_entry {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 239) /* List header used to link this structure to the "struct epitem" */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 240) struct list_head llink;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 241)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 242) /* The "base" pointer is set to the container "struct epitem" */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 243) struct epitem *base;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 244)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 245) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 246) * Wait queue item that will be linked to the target file wait
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 247) * queue head.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 248) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 249) wait_queue_entry_t wait;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 250)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 251) /* The wait queue head to which the "wait" wait queue item is linked */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 252) wait_queue_head_t *whead;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 253) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 254)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 255) /* Wrapper struct used by poll queueing */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 256) struct ep_pqueue {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 257) poll_table pt;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 258) struct epitem *epi;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 259) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 260)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 261) /* Used by the ep_send_events() function as callback private data */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 262) struct ep_send_events_data {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 263) int maxevents;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 264) struct epoll_event __user *events;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 265) int res;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 266) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 267)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 268) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 269) * Configuration options available inside /proc/sys/fs/epoll/
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 270) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 271) /* Maximum number of epoll watched descriptors, per user */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 272) static long max_user_watches __read_mostly;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 273)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 274) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 275) * This mutex is used to serialize ep_free() and eventpoll_release_file().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 276) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 277) static DEFINE_MUTEX(epmutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 278)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 279) static u64 loop_check_gen = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 280)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 281) /* Used to check for epoll file descriptor inclusion loops */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 282) static struct nested_calls poll_loop_ncalls;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 283)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 284) /* Slab cache used to allocate "struct epitem" */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 285) static struct kmem_cache *epi_cache __read_mostly;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 286)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 287) /* Slab cache used to allocate "struct eppoll_entry" */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 288) static struct kmem_cache *pwq_cache __read_mostly;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 289)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 290) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 291) * List of files with newly added links, where we may need to limit the number
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 292) * of emanating paths. Protected by the epmutex.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 293) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 294) static LIST_HEAD(tfile_check_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 295)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 296) #ifdef CONFIG_SYSCTL
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 297)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 298) #include <linux/sysctl.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 299)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 300) static long long_zero;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 301) static long long_max = LONG_MAX;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 302)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 303) struct ctl_table epoll_table[] = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 304) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 305) .procname = "max_user_watches",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 306) .data = &max_user_watches,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 307) .maxlen = sizeof(max_user_watches),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 308) .mode = 0644,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 309) .proc_handler = proc_doulongvec_minmax,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 310) .extra1 = &long_zero,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 311) .extra2 = &long_max,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 312) },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 313) { }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 314) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 315) #endif /* CONFIG_SYSCTL */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 316)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 317) static const struct file_operations eventpoll_fops;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 318)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 319) static inline int is_file_epoll(struct file *f)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 320) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 321) return f->f_op == &eventpoll_fops;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 322) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 323)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 324) /* Set up the structure that is used as the key for the RB tree */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 325) static inline void ep_set_ffd(struct epoll_filefd *ffd,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 326) struct file *file, int fd)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 327) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 328) ffd->file = file;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 329) ffd->fd = fd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 330) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 331)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 332) /* Compare RB tree keys */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 333) static inline int ep_cmp_ffd(struct epoll_filefd *p1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 334) struct epoll_filefd *p2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 335) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 336) return (p1->file > p2->file ? +1:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 337) (p1->file < p2->file ? -1 : p1->fd - p2->fd));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 338) }
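
/*
 * Note on the comparison above: keys are ordered by the struct file
 * pointer first and by the fd number second, so the same underlying
 * file registered through two different descriptors (e.g. after dup())
 * produces two distinct keys and therefore two distinct epitems.
 */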
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 339)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 340) /* Tells us if the item is currently linked */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 341) static inline int ep_is_linked(struct epitem *epi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 342) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 343) return !list_empty(&epi->rdllink);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 344) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 345)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 346) static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 347) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 348) return container_of(p, struct eppoll_entry, wait);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 349) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 350)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 351) /* Get the "struct epitem" from a wait queue pointer */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 352) static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 353) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 354) return container_of(p, struct eppoll_entry, wait)->base;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 355) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 356)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 357) /* Get the "struct epitem" from an epoll queue wrapper */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 358) static inline struct epitem *ep_item_from_epqueue(poll_table *p)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 359) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 360) return container_of(p, struct ep_pqueue, pt)->epi;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 361) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 362)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 363) /* Initialize the nested calls tracking structure */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 364) static void ep_nested_calls_init(struct nested_calls *ncalls)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 365) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 366) INIT_LIST_HEAD(&ncalls->tasks_call_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 367) spin_lock_init(&ncalls->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 368) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 369)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 370) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 371) * ep_events_available - Checks if ready events might be available.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 372) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 373) * @ep: Pointer to the eventpoll context.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 374) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 375) * Returns: a value different from zero if ready events are available,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 376) * or zero otherwise.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 377) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 378) static inline int ep_events_available(struct eventpoll *ep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 379) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 380) return !list_empty_careful(&ep->rdllist) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 381) READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 382) }
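
/*
 * Note: the ovflist check above matters because ep_scan_ready_list()
 * temporarily sets ep->ovflist to NULL and parks newly signalled items
 * there while the ready list is being transferred; during that window
 * ep->rdllist may look empty even though events are pending, and the
 * only hint is that ovflist is no longer EP_UNACTIVE_PTR.
 */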
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 383)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 384) #ifdef CONFIG_NET_RX_BUSY_POLL
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 385) static bool ep_busy_loop_end(void *p, unsigned long start_time)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 386) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 387) struct eventpoll *ep = p;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 388)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 389) return ep_events_available(ep) || busy_loop_timeout(start_time);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 390) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 391)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 392) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 393) * Busy poll if busy polling is globally enabled, a supporting socket was
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 394) * found and there are no pending events; the busy loop will return when
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 395) * need_resched() is set or ep_events_available() becomes true.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 396) * We must do our busy polling with irqs enabled.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 397) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 398) static void ep_busy_loop(struct eventpoll *ep, int nonblock)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 399) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 400) unsigned int napi_id = READ_ONCE(ep->napi_id);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 401)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 402) if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 403) napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 404) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 405)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 406) static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 407) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 408) if (ep->napi_id)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 409) ep->napi_id = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 410) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 411)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 412) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 413) * Set epoll busy poll NAPI ID from sk.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 414) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 415) static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 416) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 417) struct eventpoll *ep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 418) unsigned int napi_id;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 419) struct socket *sock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 420) struct sock *sk;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 421) int err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 422)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 423) if (!net_busy_loop_on())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 424) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 425)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 426) sock = sock_from_file(epi->ffd.file, &err);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 427) if (!sock)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 428) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 429)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 430) sk = sock->sk;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 431) if (!sk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 432) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 433)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 434) napi_id = READ_ONCE(sk->sk_napi_id);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 435) ep = epi->ep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 436)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 437) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 438) * Reject non-NAPI IDs; there is also nothing to do if we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 439) * already have this NAPI ID recorded.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 440) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 441) if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 442) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 443)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 444) /* record NAPI ID for use in next busy poll */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 445) ep->napi_id = napi_id;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 446) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 447)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 448) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 449)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 450) static inline void ep_busy_loop(struct eventpoll *ep, int nonblock)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 451) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 452) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 453)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 454) static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 455) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 456) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 457)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 458) static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 459) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 460) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 461)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 462) #endif /* CONFIG_NET_RX_BUSY_POLL */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 463)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 464) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 465) * ep_call_nested - Perform a bound (possibly) nested call, by checking
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 466) * that the recursion limit is not exceeded, and that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 467) * the same nested call (identified by its cookie) is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 468) * not re-entered.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 469) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 470) * @ncalls: Pointer to the nested_calls structure to be used for this call.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 471) * @nproc: Nested call core function pointer.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 472) * @priv: Opaque data to be passed to the @nproc callback.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 473) * @cookie: Cookie to be used to identify this nested call.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 474) * @ctx: This instance context.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 475) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 476) * Returns: the code returned by the @nproc callback, or -1 if
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 477) * the maximum recursion limit has been exceeded.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 478) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 479) static int ep_call_nested(struct nested_calls *ncalls,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 480) int (*nproc)(void *, void *, int), void *priv,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 481) void *cookie, void *ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 482) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 483) int error, call_nests = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 484) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 485) struct list_head *lsthead = &ncalls->tasks_call_list;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 486) struct nested_call_node *tncur;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 487) struct nested_call_node tnode;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 488)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 489) spin_lock_irqsave(&ncalls->lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 490)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 491) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 492) * Try to see if the current task is already inside this wakeup call.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 493) * We use a list here, since the population inside this set is always
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 494) * very much limited.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 495) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 496) list_for_each_entry(tncur, lsthead, llink) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 497) if (tncur->ctx == ctx &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 498) (tncur->cookie == cookie || ++call_nests > EP_MAX_NESTS)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 499) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 500) * Oops ... loop detected or maximum nesting level reached.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 501) * We abort this wake by breaking the cycle itself.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 502) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 503) error = -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 504) goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 505) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 506) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 507)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 508) /* Add the current task and cookie to the list */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 509) tnode.ctx = ctx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 510) tnode.cookie = cookie;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 511) list_add(&tnode.llink, lsthead);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 512)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 513) spin_unlock_irqrestore(&ncalls->lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 514)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 515) /* Call the nested function */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 516) error = (*nproc)(priv, cookie, call_nests);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 517)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 518) /* Remove the current task from the list */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 519) spin_lock_irqsave(&ncalls->lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 520) list_del(&tnode.llink);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 521) out_unlock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 522) spin_unlock_irqrestore(&ncalls->lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 523)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 524) return error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 525) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 526)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 527) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 528) * As described in commit 0ccf831cb ("lockdep: annotate epoll"),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 529) * the use of wait queues by epoll is done in a very controlled
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 530) * manner. Wake ups can nest inside each other, but are never done
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 531) * with the same locking. For example:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 532) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 533) * dfd = socket(...);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 534) * efd1 = epoll_create();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 535) * efd2 = epoll_create();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 536) * epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 537) * epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 538) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 539) * When a packet arrives at the device underneath "dfd", the net code will
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 540) * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 541) * callback wakeup entry on that queue, and the wake_up() performed by the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 542) * "dfd" net code will end up in ep_poll_callback(). At this point epoll
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 543) * (efd1) notices that it may have some event ready, so it needs to wake up
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 544) * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 545) * that ends up in another wake_up(), after having checked the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 546) * recursion constraints. That is, no more than EP_MAX_NESTS, to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 547) * avoid stack blasting.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 548) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 549) * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 550) * this special case of epoll.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 551) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 552) #ifdef CONFIG_DEBUG_LOCK_ALLOC
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 553)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 554) static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 555) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 556) struct eventpoll *ep_src;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 557) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 558) u8 nests = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 559)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 560) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 561) * To set the subclass or nesting level for spin_lock_irqsave_nested()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 562) * it might be natural to create a per-cpu nest count. However, since
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 563) * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 564) * schedule() in the -rt kernel, the per-cpu variables are no longer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 565) * protected. Thus, we are introducing a per eventpoll nest field.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 566) * If we are not being called from ep_poll_callback(), epi is NULL and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 567) * we are at the first level of nesting, 0. Otherwise, we are being
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 568) * called from ep_poll_callback() and if a previous wakeup source is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 569) * not an epoll file itself, we are at depth 1 since the wakeup source
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 570) * is depth 0. If the wakeup source is a previous epoll file in the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 571) * wakeup chain then we use its nests value and record ours as
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 572) * nests + 1. The previous epoll file nests value is stable since it is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 573) * already holding its own poll_wait.lock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 574) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 575) if (epi) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 576) if (is_file_epoll(epi->ffd.file)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 577) ep_src = epi->ffd.file->private_data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 578) nests = ep_src->nests;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 579) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 580) nests = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 581) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 582) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 583) spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 584) ep->nests = nests + 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 585) wake_up_locked_poll(&ep->poll_wait, EPOLLIN);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 586) ep->nests = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 587) spin_unlock_irqrestore(&ep->poll_wait.lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 588) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 589)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 590) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 591)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 592) static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 593) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 594) wake_up_poll(&ep->poll_wait, EPOLLIN);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 595) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 596)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 597) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 598)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 599) static void ep_remove_wait_queue(struct eppoll_entry *pwq)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 600) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 601) wait_queue_head_t *whead;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 602)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 603) rcu_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 604) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 605) * If it is cleared by POLLFREE, it should be rcu-safe.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 606) * If we read NULL we need a barrier paired with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 607) * smp_store_release() in ep_poll_callback(), otherwise
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 608) * we rely on whead->lock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 609) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 610) whead = smp_load_acquire(&pwq->whead);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 611) if (whead)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 612) remove_wait_queue(whead, &pwq->wait);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 613) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 614) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 615)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 616) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 617) * This function unregisters poll callbacks from the associated file
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 618) * descriptor. Must be called with "mtx" held (or "epmutex" if called from
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 619) * ep_free).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 620) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 621) static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 622) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 623) struct list_head *lsthead = &epi->pwqlist;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 624) struct eppoll_entry *pwq;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 625)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 626) while (!list_empty(lsthead)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 627) pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 628)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 629) list_del(&pwq->llink);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 630) ep_remove_wait_queue(pwq);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 631) kmem_cache_free(pwq_cache, pwq);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 632) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 633) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 634)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 635) /* call only when ep->mtx is held */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 636) static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 637) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 638) return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 639) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 640)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 641) /* call only when ep->mtx is held */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 642) static inline void ep_pm_stay_awake(struct epitem *epi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 643) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 644) struct wakeup_source *ws = ep_wakeup_source(epi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 645)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 646) if (ws)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 647) __pm_stay_awake(ws);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 648) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 649)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 650) static inline bool ep_has_wakeup_source(struct epitem *epi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 651) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 652) return rcu_access_pointer(epi->ws) ? true : false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 653) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 654)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 655) /* call when ep->mtx cannot be held (ep_poll_callback) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 656) static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 657) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 658) struct wakeup_source *ws;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 659)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 660) rcu_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 661) ws = rcu_dereference(epi->ws);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 662) if (ws)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 663) __pm_stay_awake(ws);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 664) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 665) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 666)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 667) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 668) * ep_scan_ready_list - Scans the ready list in a way that makes it possible
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 669) * for the scan code to call f_op->poll(). Also allows for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 670) * O(NumReady) performance.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 671) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 672) * @ep: Pointer to the epoll private data structure.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 673) * @sproc: Pointer to the scan callback.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 674) * @priv: Private opaque data passed to the @sproc callback.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 675) * @depth: The current depth of recursive f_op->poll calls.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 676) * @ep_locked: caller already holds ep->mtx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 677) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 678) * Returns: The same __poll_t value returned by the @sproc callback.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 679) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 680) static __poll_t ep_scan_ready_list(struct eventpoll *ep,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 681) __poll_t (*sproc)(struct eventpoll *,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 682) struct list_head *, void *),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 683) void *priv, int depth, bool ep_locked)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 684) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 685) __poll_t res;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 686) struct epitem *epi, *nepi;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 687) LIST_HEAD(txlist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 688)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 689) lockdep_assert_irqs_enabled();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 690)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 691) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 692) * We need to lock this because we could be hit by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 693) * eventpoll_release_file() and epoll_ctl().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 694) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 695)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 696) if (!ep_locked)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 697) mutex_lock_nested(&ep->mtx, depth);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 698)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 699) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 700) * Steal the ready list, and re-init the original one to the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 701) * empty list. Also, set ep->ovflist to NULL so that events
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 702) * happening while looping without locks are not lost. We cannot
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 703) * have the poll callback queue directly on ep->rdllist,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 704) * because we want the "sproc" callback to be able to process it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 705) * in a lockless way.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 706) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 707) write_lock_irq(&ep->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 708) list_splice_init(&ep->rdllist, &txlist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 709) WRITE_ONCE(ep->ovflist, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 710) write_unlock_irq(&ep->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 711)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 712) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 713) * Now call the callback function.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 714) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 715) res = (*sproc)(ep, &txlist, priv);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 716)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 717) write_lock_irq(&ep->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 718) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 719) * During the time we spent inside the "sproc" callback, some
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 720) * other events might have been queued by the poll callback.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 721) * We re-insert them inside the main ready-list here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 722) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 723) for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 724) nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 725) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 726) * We need to check if the item is already in the list.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 727) * During the "sproc" callback execution time, items are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 728) * queued into ->ovflist but the "txlist" might already
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 729) * contain them, and the list_splice() below takes care of them.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 730) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 731) if (!ep_is_linked(epi)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 732) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 733) * ->ovflist is LIFO, so we have to reverse it in order
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 734) * to keep it in FIFO order.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 735) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 736) list_add(&epi->rdllink, &ep->rdllist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 737) ep_pm_stay_awake(epi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 738) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 739) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 740) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 741) * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 742) * releasing the lock, events will be queued in the normal way inside
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 743) * ep->rdllist.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 744) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 745) WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 746)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 747) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 748) * Quickly re-inject items left on "txlist".
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 749) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 750) list_splice(&txlist, &ep->rdllist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 751) __pm_relax(ep->ws);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 752)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 753) if (!list_empty(&ep->rdllist)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 754) if (waitqueue_active(&ep->wq))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 755) wake_up(&ep->wq);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 756) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 757)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 758) write_unlock_irq(&ep->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 759)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 760) if (!ep_locked)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 761) mutex_unlock(&ep->mtx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 762)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 763) return res;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 764) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 765)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 766) static void epi_rcu_free(struct rcu_head *head)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 767) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 768) struct epitem *epi = container_of(head, struct epitem, rcu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 769) kmem_cache_free(epi_cache, epi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 770) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 771)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 772) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 773) * Removes a "struct epitem" from the eventpoll RB tree and deallocates
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 774) * all the associated resources. Must be called with "mtx" held.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 775) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 776) static int ep_remove(struct eventpoll *ep, struct epitem *epi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 777) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 778) struct file *file = epi->ffd.file;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 779)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 780) lockdep_assert_irqs_enabled();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 781)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 782) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 783) * Removes poll wait queue hooks.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 784) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 785) ep_unregister_pollwait(ep, epi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 786)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 787) /* Remove the current item from the list of epoll hooks */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 788) spin_lock(&file->f_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 789) list_del_rcu(&epi->fllink);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 790) spin_unlock(&file->f_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 791)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 792) rb_erase_cached(&epi->rbn, &ep->rbr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 793)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 794) write_lock_irq(&ep->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 795) if (ep_is_linked(epi))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 796) list_del_init(&epi->rdllink);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 797) write_unlock_irq(&ep->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 798)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 799) wakeup_source_unregister(ep_wakeup_source(epi));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 800) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 801) * At this point it is safe to free the eventpoll item. Use the union
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 802) * field epi->rcu, since we are trying to minimize the size of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 803) * 'struct epitem'. The 'rbn' field is no longer in use. Protected by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 804) * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 805) * use of the rbn field.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 806) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 807) call_rcu(&epi->rcu, epi_rcu_free);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 808)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 809) atomic_long_dec(&ep->user->epoll_watches);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 810)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 811) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 812) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 813)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 814) static void ep_free(struct eventpoll *ep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 815) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 816) struct rb_node *rbp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 817) struct epitem *epi;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 818)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 819) /* We need to release all tasks waiting for this file */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 820) if (waitqueue_active(&ep->poll_wait))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 821) ep_poll_safewake(ep, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 822)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 823) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 824) * We need to lock this because we could be hit by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 825) * eventpoll_release_file() while we're freeing the "struct eventpoll".
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 826) * We do not need to hold "ep->mtx" here because the epoll file
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 827) * is on the way to be removed and no one has references to it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 828) * anymore. The only hit might come from eventpoll_release_file() but
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 829) * holding "epmutex" is sufficient here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 830) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 831) mutex_lock(&epmutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 832)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 833) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 834) * Walks through the whole tree, unregistering poll callbacks.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 835) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 836) for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 837) epi = rb_entry(rbp, struct epitem, rbn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 838)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 839) ep_unregister_pollwait(ep, epi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 840) cond_resched();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 841) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 842)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 843) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 844) * Walks through the whole tree, freeing each "struct epitem". At this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 845) * point we are sure no poll callbacks will be lingering around, and since
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 846) * we are holding "epmutex" we can be sure that no file cleanup code will
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 847) * hit us during this operation. So we can avoid taking "ep->lock".
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 848) * We do not need to lock ep->mtx either; we only do it to prevent
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 849) * a lockdep warning.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 850) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 851) mutex_lock(&ep->mtx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 852) while ((rbp = rb_first_cached(&ep->rbr)) != NULL) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 853) epi = rb_entry(rbp, struct epitem, rbn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 854) ep_remove(ep, epi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 855) cond_resched();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 856) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 857) mutex_unlock(&ep->mtx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 858)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 859) mutex_unlock(&epmutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 860) mutex_destroy(&ep->mtx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 861) free_uid(ep->user);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 862) wakeup_source_unregister(ep->ws);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 863) kfree(ep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 864) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 865)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 866) static int ep_eventpoll_release(struct inode *inode, struct file *file)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 867) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 868) struct eventpoll *ep = file->private_data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 869)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 870) if (ep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 871) ep_free(ep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 872)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 873) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 874) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 875)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 876) static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 877) void *priv);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 878) static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 879) poll_table *pt);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 880)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 881) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 882) * Differs from ep_eventpoll_poll() in that internal callers already hold
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 883) * ep->mtx, so we need to start from depth = 1 so that mutex_lock_nested()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 884) * is correctly annotated.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 885) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 886) static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 887) int depth)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 888) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 889) struct eventpoll *ep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 890) bool locked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 891)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 892) pt->_key = epi->event.events;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 893) if (!is_file_epoll(epi->ffd.file))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 894) return vfs_poll(epi->ffd.file, pt) & epi->event.events;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 895)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 896) ep = epi->ffd.file->private_data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 897) poll_wait(epi->ffd.file, &ep->poll_wait, pt);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 898) locked = pt && (pt->_qproc == ep_ptable_queue_proc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 899)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 900) return ep_scan_ready_list(epi->ffd.file->private_data,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 901) ep_read_events_proc, &depth, depth,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 902) locked) & epi->event.events;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 903) }
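
/*
 * Illustrative userspace sketch (not part of this file; uses <sys/epoll.h>,
 * and the fd names are hypothetical): one epoll file descriptor can itself be
 * registered inside another, which is why ep_item_poll() recurses into the
 * nested eventpoll with an increased depth instead of calling vfs_poll() on it.
 *
 *	int inner = epoll_create1(0);
 *	int outer = epoll_create1(0);
 *	struct epoll_event ev = { .events = EPOLLIN, .data.fd = some_fd };
 *
 *	epoll_ctl(inner, EPOLL_CTL_ADD, some_fd, &ev);	// watch a regular fd
 *	ev.data.fd = inner;
 *	epoll_ctl(outer, EPOLL_CTL_ADD, inner, &ev);	// watch the inner epoll fd
 *
 * A poll on "outer" then reaches ep_eventpoll_poll() for the inner instance,
 * and ep_item_poll() scans its ready list at depth 1.
 */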
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 904)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 905) static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 906) void *priv)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 907) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 908) struct epitem *epi, *tmp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 909) poll_table pt;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 910) int depth = *(int *)priv;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 911)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 912) init_poll_funcptr(&pt, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 913) depth++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 914)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 915) list_for_each_entry_safe(epi, tmp, head, rdllink) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 916) if (ep_item_poll(epi, &pt, depth)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 917) return EPOLLIN | EPOLLRDNORM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 918) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 919) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 920) * Item has been dropped into the ready list by the poll
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 921) * callback, but it's not actually ready, as far as the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 922) * caller's requested events go. We can remove it here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 923) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 924) __pm_relax(ep_wakeup_source(epi));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 925) list_del_init(&epi->rdllink);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 926) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 927) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 928)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 929) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 930) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 931)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 932) static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 933) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 934) struct eventpoll *ep = file->private_data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 935) int depth = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 936)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 937) /* Insert inside our poll wait queue */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 938) poll_wait(file, &ep->poll_wait, wait);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 939)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 940) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 941) * Proceed to find out if wanted events are really available inside
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 942) * the ready list.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 943) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 944) return ep_scan_ready_list(ep, ep_read_events_proc,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 945) &depth, depth, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 946) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 947)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 948) #ifdef CONFIG_PROC_FS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 949) static void ep_show_fdinfo(struct seq_file *m, struct file *f)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 950) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 951) struct eventpoll *ep = f->private_data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 952) struct rb_node *rbp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 953)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 954) mutex_lock(&ep->mtx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 955) for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 956) struct epitem *epi = rb_entry(rbp, struct epitem, rbn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 957) struct inode *inode = file_inode(epi->ffd.file);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 958)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 959) seq_printf(m, "tfd: %8d events: %8x data: %16llx "
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 960) " pos:%lli ino:%lx sdev:%x\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 961) epi->ffd.fd, epi->event.events,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 962) (long long)epi->event.data,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 963) (long long)epi->ffd.file->f_pos,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 964) inode->i_ino, inode->i_sb->s_dev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 965) if (seq_has_overflowed(m))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 966) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 967) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 968) mutex_unlock(&ep->mtx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 969) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 970) #endif
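
/*
 * For reference, each line that ep_show_fdinfo() emits into
 * /proc/<pid>/fdinfo/<epfd> follows the format string above. An illustrative
 * (made-up) entry for a watched descriptor 9 would look roughly like:
 *
 *	tfd:        9 events:       19 data: 100000009  pos:0 ino:2a sdev:6
 *
 * "events" and "data" are printed in hex; "ino" and "sdev" identify the inode
 * and the superblock device of the watched file.
 */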
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 971)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 972) /* File callbacks that implement the eventpoll file behaviour */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 973) static const struct file_operations eventpoll_fops = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 974) #ifdef CONFIG_PROC_FS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 975) .show_fdinfo = ep_show_fdinfo,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 976) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 977) .release = ep_eventpoll_release,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 978) .poll = ep_eventpoll_poll,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 979) .llseek = noop_llseek,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 980) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 981)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 982) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 983) * This is called from eventpoll_release() to unlink files from the eventpoll
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 984) * interface. We need this facility to correctly clean up files that are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 985) * closed without being removed from the eventpoll interface.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 986) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 987) void eventpoll_release_file(struct file *file)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 988) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 989) struct eventpoll *ep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 990) struct epitem *epi, *next;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 991)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 992) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 993) * We don't need to take "file->f_lock" here, because we're in the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 994) * "struct file" cleanup path, which means that no one is using this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 995) * file anymore. So, for example, epoll_ctl() cannot hit here, since if
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 996) * we reach this point the file counter has already gone to zero and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 997) * fget() would fail.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 998) * The only hit might come from ep_free(), but holding "epmutex"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 999) * correctly serializes the operation. We do need to acquire
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1000) * "ep->mtx" after "epmutex" because ep_remove() requires it when called
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1001) * from anywhere but ep_free().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1002) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1003) * Besides, ep_remove() acquires the lock, so we can't hold it here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1004) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1005) mutex_lock(&epmutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1006) list_for_each_entry_safe(epi, next, &file->f_ep_links, fllink) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1007) ep = epi->ep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1008) mutex_lock_nested(&ep->mtx, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1009) ep_remove(ep, epi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1010) mutex_unlock(&ep->mtx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1011) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1012) mutex_unlock(&epmutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1013) }
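
/*
 * Userspace-visible consequence (hedged sketch; fd names hypothetical): a
 * watched descriptor that is closed without EPOLL_CTL_DEL is cleaned up by
 * this path, but only once the last reference to the underlying open file
 * description is gone.
 *
 *	epoll_ctl(epfd, EPOLL_CTL_ADD, sock_fd, &ev);
 *	int dupfd = dup(sock_fd);
 *	close(sock_fd);		// registration survives: dupfd still pins
 *				// the same struct file
 *	close(dupfd);		// now eventpoll_release_file() drops the epitem
 */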
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1014)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1015) static int ep_alloc(struct eventpoll **pep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1016) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1017) int error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1018) struct user_struct *user;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1019) struct eventpoll *ep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1020)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1021) user = get_current_user();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1022) error = -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1023) ep = kzalloc(sizeof(*ep), GFP_KERNEL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1024) if (unlikely(!ep))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1025) goto free_uid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1026)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1027) mutex_init(&ep->mtx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1028) rwlock_init(&ep->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1029) init_waitqueue_head(&ep->wq);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1030) init_waitqueue_head(&ep->poll_wait);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1031) INIT_LIST_HEAD(&ep->rdllist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1032) ep->rbr = RB_ROOT_CACHED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1033) ep->ovflist = EP_UNACTIVE_PTR;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1034) ep->user = user;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1035)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1036) *pep = ep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1037)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1038) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1039)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1040) free_uid:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1041) free_uid(user);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1042) return error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1043) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1044)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1045) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1046) * Search the file inside the eventpoll tree. The RB tree operations
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1047) * are protected by the "mtx" mutex, and ep_find() must be called with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1048) * "mtx" held.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1049) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1050) static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1051) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1052) int kcmp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1053) struct rb_node *rbp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1054) struct epitem *epi, *epir = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1055) struct epoll_filefd ffd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1056)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1057) ep_set_ffd(&ffd, file, fd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1058) for (rbp = ep->rbr.rb_root.rb_node; rbp; ) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1059) epi = rb_entry(rbp, struct epitem, rbn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1060) kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1061) if (kcmp > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1062) rbp = rbp->rb_right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1063) else if (kcmp < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1064) rbp = rbp->rb_left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1065) else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1066) epir = epi;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1067) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1068) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1069) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1070)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1071) return epir;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1072) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1073)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1074) #ifdef CONFIG_KCMP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1075) static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1076) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1077) struct rb_node *rbp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1078) struct epitem *epi;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1079)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1080) for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1081) epi = rb_entry(rbp, struct epitem, rbn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1082) if (epi->ffd.fd == tfd) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1083) if (toff == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1084) return epi;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1085) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1086) toff--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1087) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1088) cond_resched();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1089) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1090)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1091) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1092) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1093)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1094) struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1095) unsigned long toff)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1096) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1097) struct file *file_raw;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1098) struct eventpoll *ep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1099) struct epitem *epi;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1100)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1101) if (!is_file_epoll(file))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1102) return ERR_PTR(-EINVAL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1103)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1104) ep = file->private_data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1105)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1106) mutex_lock(&ep->mtx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1107) epi = ep_find_tfd(ep, tfd, toff);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1108) if (epi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1109) file_raw = epi->ffd.file;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1110) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1111) file_raw = ERR_PTR(-ENOENT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1112) mutex_unlock(&ep->mtx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1113)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1114) return file_raw;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1115) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1116) #endif /* CONFIG_KCMP */
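
/*
 * get_epoll_tfile_raw_ptr() backs the kcmp(2) KCMP_EPOLL_TFD operation, which
 * lets tools such as checkpoint/restore check whether some descriptor refers
 * to the same struct file as a given slot of an epoll instance. Hedged
 * userspace sketch (error handling omitted; pid/fd values hypothetical):
 *
 *	#include <linux/kcmp.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	struct kcmp_epoll_slot slot = {
 *		.efd  = epfd,	// the epoll descriptor
 *		.tfd  = tfd,	// the target fd number stored in the epitem
 *		.toff = 0,	// which duplicate of tfd inside epfd (see toff above)
 *	};
 *	long same = syscall(SYS_kcmp, pid, pid, KCMP_EPOLL_TFD, some_fd,
 *			    (unsigned long)&slot);
 *	// 0 means some_fd and the epoll slot point to the same struct file
 */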
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1117)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1118) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1119) * Adds a new entry to the tail of the list in a lockless way, i.e.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1120) * multiple CPUs are allowed to call this function concurrently.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1121) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1122) * Beware: it is necessary to prevent any other modifications of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1123) * existing list until all changes are completed; in other words,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1124) * concurrent list_add_tail_lockless() calls should be protected
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1125) * with a read lock, where the write lock acts as a barrier which
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1126) * makes sure all list_add_tail_lockless() calls are fully
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1127) * completed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1128) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1129) * Also, an element can be locklessly added to the list in only one
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1130) * direction, i.e. either to the tail or to the head; otherwise
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1131) * concurrent access will corrupt the list.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1132) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1133) * Returns %false if the element has already been added to the list,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1134) * %true otherwise.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1135) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1136) static inline bool list_add_tail_lockless(struct list_head *new,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1137) struct list_head *head)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1138) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1139) struct list_head *prev;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1140)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1141) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1142) * This is a simple 'new->next = head' operation, but cmpxchg()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1143) * is used in order to detect that the same element has just been
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1144) * added to the list from another CPU: the winner observes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1145) * new->next == new.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1146) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1147) if (cmpxchg(&new->next, new, head) != new)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1148) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1149)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1150) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1151) * Initially ->next of a new element must be updated with the head
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1152) * (we are inserting to the tail) and only then pointers are atomically
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1153) * exchanged. XCHG guarantees memory ordering, thus ->next is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1154) * updated before the pointers are actually swapped, and the pointers
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1155) * are swapped before prev->next is updated.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1156) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1157)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1158) prev = xchg(&head->prev, new);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1159)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1160) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1161) * It is safe to modify prev->next and new->prev, because a new element
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1162) * is added only to the tail and new->next is updated before XCHG.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1163) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1164)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1165) prev->next = new;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1166) new->prev = prev;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1167)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1168) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1169) }
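
/*
 * Worked illustration of the steps above, assuming an empty ready list (the
 * head H has H->next == H->prev == H) and a new element N that was left
 * self-pointing by list_del_init() (so N->next == N):
 *
 *	cmpxchg(&N->next, N, H)   succeeds: N->next = H   (N is now claimed)
 *	prev = xchg(&H->prev, N)            prev = H, H->prev = N
 *	prev->next = N                      H->next = N
 *	N->prev = prev                      N->prev = H
 *
 * A second CPU racing to add the same N loses the cmpxchg (N->next is no
 * longer equal to N) and returns false, which is exactly the "already added"
 * case mentioned above.
 */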
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1170)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1171) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1172) * Chains a new epi entry onto the head of ep->ovflist (LIFO) in a lockless
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1173) * way, i.e. multiple CPUs are allowed to call this function concurrently.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1174) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1175) * Returns %false if the epi element has already been chained, %true otherwise.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1176) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1177) static inline bool chain_epi_lockless(struct epitem *epi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1178) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1179) struct eventpoll *ep = epi->ep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1180)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1181) /* Fast preliminary check */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1182) if (epi->next != EP_UNACTIVE_PTR)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1183) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1184)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1185) /* Check that the same epi has not just been chained from another CPU */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1186) if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1187) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1188)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1189) /* Atomically exchange tail */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1190) epi->next = xchg(&ep->ovflist, epi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1191)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1192) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1193) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1194)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1195) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1196) * This is the callback that is passed to the wait queue wakeup
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1197) * mechanism. It is called by the stored file descriptors when they
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1198) * have events to report.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1199) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1200) * This callback takes a read lock in order not to contend with concurrent
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1201) * events from other file descriptors, thus all modifications to ->rdllist
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1202) * or ->ovflist are lockless. The read lock is paired with the write lock from
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1203) * ep_scan_ready_list(), which stops all list modifications and guarantees
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1204) * that the lists' state is seen correctly.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1205) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1206) * Another thing worth mentioning is that ep_poll_callback() can be called
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1207) * concurrently for the same @epi from different CPUs if the poll table was
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1208) * initialized with several wait queue entries. Wakeups from different CPUs on
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1209) * a single wait queue are serialized by wq.lock, but the case where multiple
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1210) * wait queues are used has to be detected separately. This is done using a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1211) * cmpxchg() operation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1212) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1213) static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1214) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1215) int pwake = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1216) struct epitem *epi = ep_item_from_wait(wait);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1217) struct eventpoll *ep = epi->ep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1218) __poll_t pollflags = key_to_poll(key);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1219) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1220) int ewake = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1221)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1222) read_lock_irqsave(&ep->lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1223)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1224) ep_set_busy_poll_napi_id(epi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1225)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1226) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1227) * If the event mask does not contain any poll(2) event, we consider the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1228) * descriptor to be disabled. This condition is likely the effect of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1229) * EPOLLONESHOT bit that disables the descriptor when an event is received,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1230) * until the next EPOLL_CTL_MOD is issued.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1231) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1232) if (!(epi->event.events & ~EP_PRIVATE_BITS))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1233) goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1234)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1235) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1236) * Check the events coming with the callback. At this stage, not
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1237) * every device reports the events in the "key" parameter of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1238) * callback. We need to be able to handle both cases here, hence the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1239) * test for "key" != NULL before the event match test.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1240) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1241) if (pollflags && !(pollflags & epi->event.events))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1242) goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1243)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1244) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1245) * If we are transferring events to userspace, we can hold no locks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1246) * (because we're accessing user memory, and because of Linux f_op->poll()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1247) * semantics). All the events that happen during that period of time are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1248) * chained in ep->ovflist and requeued later on.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1249) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1250) if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1251) if (chain_epi_lockless(epi))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1252) ep_pm_stay_awake_rcu(epi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1253) } else if (!ep_is_linked(epi)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1254) /* In the usual case, add event to ready list. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1255) if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1256) ep_pm_stay_awake_rcu(epi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1257) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1258)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1259) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1260) * Wake up (if active) both the eventpoll wait list and the ->poll()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1261) * wait list.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1262) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1263) if (waitqueue_active(&ep->wq)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1264) if ((epi->event.events & EPOLLEXCLUSIVE) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1265) !(pollflags & POLLFREE)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1266) switch (pollflags & EPOLLINOUT_BITS) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1267) case EPOLLIN:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1268) if (epi->event.events & EPOLLIN)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1269) ewake = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1270) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1271) case EPOLLOUT:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1272) if (epi->event.events & EPOLLOUT)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1273) ewake = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1274) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1275) case 0:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1276) ewake = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1277) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1278) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1279) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1280) wake_up(&ep->wq);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1281) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1282) if (waitqueue_active(&ep->poll_wait))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1283) pwake++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1284)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1285) out_unlock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1286) read_unlock_irqrestore(&ep->lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1287)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1288) /* We have to call this outside the lock */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1289) if (pwake)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1290) ep_poll_safewake(ep, epi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1291)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1292) if (!(epi->event.events & EPOLLEXCLUSIVE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1293) ewake = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1294)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1295) if (pollflags & POLLFREE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1296) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1297) * If we race with ep_remove_wait_queue() it can miss
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1298) * ->whead = NULL and do another remove_wait_queue() after
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1299) * us, so we can't use __remove_wait_queue().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1300) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1301) list_del_init(&wait->entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1302) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1303) * ->whead != NULL protects us from the race with ep_free()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1304) * or ep_remove(), ep_remove_wait_queue() takes whead->lock
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1305) * held by the caller. Once we nullify it, nothing protects
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1306) * ep/epi or even wait.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1307) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1308) smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1309) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1310)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1311) return ewake;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1312) }
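
/*
 * The EPOLLEXCLUSIVE handling above limits wakeups when several waiters watch
 * the same descriptor. Hedged userspace sketch (one epoll instance per worker;
 * listen_fd is assumed to be an already listening socket):
 *
 *	static int add_exclusive(int epfd, int listen_fd)
 *	{
 *		struct epoll_event ev = {
 *			.events  = EPOLLIN | EPOLLEXCLUSIVE,
 *			.data.fd = listen_fd,
 *		};
 *		// With EPOLLEXCLUSIVE, an incoming connection wakes up at
 *		// least one - rather than all - of the waiting epoll
 *		// instances, avoiding a thundering herd.
 *		return epoll_ctl(epfd, EPOLL_CTL_ADD, listen_fd, &ev);
 *	}
 */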
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1313)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1314) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1315) * This is the callback that is used to add our wait queue to the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1316) * target file wakeup lists.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1317) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1318) static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1319) poll_table *pt)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1320) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1321) struct epitem *epi = ep_item_from_epqueue(pt);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1322) struct eppoll_entry *pwq;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1323)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1324) if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1325) init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1326) pwq->whead = whead;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1327) pwq->base = epi;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1328) if (epi->event.events & EPOLLEXCLUSIVE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1329) add_wait_queue_exclusive(whead, &pwq->wait);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1330) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1331) add_wait_queue(whead, &pwq->wait);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1332) list_add_tail(&pwq->llink, &epi->pwqlist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1333) epi->nwait++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1334) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1335) /* We have to signal that an error occurred */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1336) epi->nwait = -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1337) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1338) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1339)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1340) static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1341) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1342) int kcmp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1343) struct rb_node **p = &ep->rbr.rb_root.rb_node, *parent = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1344) struct epitem *epic;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1345) bool leftmost = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1346)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1347) while (*p) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1348) parent = *p;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1349) epic = rb_entry(parent, struct epitem, rbn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1350) kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1351) if (kcmp > 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1352) p = &parent->rb_right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1353) leftmost = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1354) } else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1355) p = &parent->rb_left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1356) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1357) rb_link_node(&epi->rbn, parent, p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1358) rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1359) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1360)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1361)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1362)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1363) #define PATH_ARR_SIZE 5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1364) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1365) * These are the numbers of paths of length 1 to 5 that we allow to emanate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1366) * from a single file of interest. For example, we allow 1000 paths of length
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1367) * 1 to emanate from each file of interest. This essentially represents the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1368) * potential wakeup paths, which need to be limited in order to avoid massive
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1369) * uncontrolled wakeup storms. The common use case should be a single ep which
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1370) * is connected to n file sources; in that case each file source has 1 path
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1371) * of length 1, so the numbers below should be more than sufficient. These
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1372) * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1373) * and delete can't add additional paths. Protected by the epmutex. (See the worked example below.)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1374) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1375) static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1376) static int path_count[PATH_ARR_SIZE];
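
/*
 * Worked example of the limits above: if a target file F is watched by epoll
 * instance E1, and E1 is in turn watched by E2, then F has one wakeup path of
 * length 1 (F -> E1) and one of length 2 (F -> E1 -> E2). During
 * EPOLL_CTL_ADD the length-2 path is charged against path_limits[1] == 500 by
 * path_count_inc(), while length-1 paths are not limited at all, since
 * path_count_inc() returns early for nests == 0.
 */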
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1377)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1378) static int path_count_inc(int nests)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1379) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1380) /* Allow an arbitrary number of depth 1 paths */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1381) if (nests == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1382) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1383)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1384) if (++path_count[nests] > path_limits[nests])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1385) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1386) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1387) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1388)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1389) static void path_count_init(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1390) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1391) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1392)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1393) for (i = 0; i < PATH_ARR_SIZE; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1394) path_count[i] = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1395) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1396)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1397) static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1398) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1399) int error = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1400) struct file *file = priv;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1401) struct file *child_file;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1402) struct epitem *epi;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1403)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1404) /* CTL_DEL can remove links here, but that can't increase our count */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1405) rcu_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1406) list_for_each_entry_rcu(epi, &file->f_ep_links, fllink) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1407) child_file = epi->ep->file;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1408) if (is_file_epoll(child_file)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1409) if (list_empty(&child_file->f_ep_links)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1410) if (path_count_inc(call_nests)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1411) error = -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1412) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1413) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1414) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1415) error = ep_call_nested(&poll_loop_ncalls,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1416) reverse_path_check_proc,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1417) child_file, child_file,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1418) current);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1419) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1420) if (error != 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1421) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1422) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1423) printk(KERN_ERR "reverse_path_check_proc: "
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1424) "file is not an ep!\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1425) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1426) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1427) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1428) return error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1429) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1430)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1431) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1432) * reverse_path_check - The tfile_check_list is a list of file *, which have
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1433) * links that are proposed to be newly added. We need to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1434) * make sure that those added links don't add too many
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1435) * paths such that we will spend all our time waking up
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1436) * eventpoll objects.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1437) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1438) * Returns: zero if the proposed links don't create too many paths,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1439) * -1 otherwise.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1440) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1441) static int reverse_path_check(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1442) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1443) int error = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1444) struct file *current_file;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1445)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1446) /* let's call this for all tfiles */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1447) list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1448) path_count_init();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1449) error = ep_call_nested(&poll_loop_ncalls,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1450) reverse_path_check_proc, current_file,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1451) current_file, current);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1452) if (error)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1453) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1454) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1455) return error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1456) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1457)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1458) static int ep_create_wakeup_source(struct epitem *epi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1459) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1460) struct name_snapshot n;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1461) struct wakeup_source *ws;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1462) char ws_name[64];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1463)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1464) strlcpy(ws_name, "eventpoll", sizeof(ws_name));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1465) trace_android_vh_ep_create_wakeup_source(ws_name, sizeof(ws_name));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1466) if (!epi->ep->ws) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1467) epi->ep->ws = wakeup_source_register(NULL, ws_name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1468) if (!epi->ep->ws)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1469) return -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1470) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1471)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1472) take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1473) strlcpy(ws_name, n.name.name, sizeof(ws_name));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1474) trace_android_vh_ep_create_wakeup_source(ws_name, sizeof(ws_name));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1475) ws = wakeup_source_register(NULL, ws_name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1476) release_dentry_name_snapshot(&n);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1477)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1478) if (!ws)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1479) return -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1480) rcu_assign_pointer(epi->ws, ws);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1481)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1482) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1483) }
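
/*
 * A per-item wakeup source is created only when userspace asks for it with
 * EPOLLWAKEUP. Hedged userspace sketch (fd names hypothetical; the caller
 * needs CAP_BLOCK_SUSPEND, otherwise epoll_ctl() silently clears the flag):
 *
 *	struct epoll_event ev = {
 *		// Hold a wakeup source while an event for this fd is queued
 *		// and being delivered, so the system does not suspend before
 *		// userspace has a chance to handle it.
 *		.events  = EPOLLIN | EPOLLWAKEUP,
 *		.data.fd = some_fd,
 *	};
 *	epoll_ctl(epfd, EPOLL_CTL_ADD, some_fd, &ev);
 */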
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1484)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1485) /* rare code path, only used when EPOLL_CTL_MOD removes a wakeup source */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1486) static noinline void ep_destroy_wakeup_source(struct epitem *epi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1487) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1488) struct wakeup_source *ws = ep_wakeup_source(epi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1489)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1490) RCU_INIT_POINTER(epi->ws, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1491)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1492) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1493) * wait for ep_pm_stay_awake_rcu to finish, synchronize_rcu is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1494) * used internally by wakeup_source_remove, too (called by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1495) * wakeup_source_unregister), so we cannot use call_rcu
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1496) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1497) synchronize_rcu();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1498) wakeup_source_unregister(ws);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1499) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1500)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1501) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1502) * Must be called with "mtx" held.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1503) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1504) static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1505) struct file *tfile, int fd, int full_check)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1506) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1507) int error, pwake = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1508) __poll_t revents;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1509) long user_watches;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1510) struct epitem *epi;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1511) struct ep_pqueue epq;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1512)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1513) lockdep_assert_irqs_enabled();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1514)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1515) user_watches = atomic_long_read(&ep->user->epoll_watches);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1516) if (unlikely(user_watches >= max_user_watches))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1517) return -ENOSPC;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1518) if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1519) return -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1520)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1521) /* Item initialization follows here ... */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1522) INIT_LIST_HEAD(&epi->rdllink);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1523) INIT_LIST_HEAD(&epi->fllink);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1524) INIT_LIST_HEAD(&epi->pwqlist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1525) epi->ep = ep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1526) ep_set_ffd(&epi->ffd, tfile, fd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1527) epi->event = *event;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1528) epi->nwait = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1529) epi->next = EP_UNACTIVE_PTR;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1530) if (epi->event.events & EPOLLWAKEUP) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1531) error = ep_create_wakeup_source(epi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1532) if (error)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1533) goto error_create_wakeup_source;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1534) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1535) RCU_INIT_POINTER(epi->ws, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1536) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1537)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1538) /* Add the current item to the list of active epoll hooks for this file */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1539) spin_lock(&tfile->f_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1540) list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1541) spin_unlock(&tfile->f_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1542)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1543) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1544) * Add the current item to the RB tree. All RB tree operations are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1545) * protected by "mtx", and ep_insert() is called with "mtx" held.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1546) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1547) ep_rbtree_insert(ep, epi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1548)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1549) /* now check if we've created too many backpaths */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1550) error = -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1551) if (full_check && reverse_path_check())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1552) goto error_remove_epi;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1553)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1554) /* Initialize the poll table using the queue callback */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1555) epq.epi = epi;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1556) init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1557)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1558) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1559) * Attach the item to the poll hooks and get current event bits.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1560) * We can safely use the file* here because its usage count has
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1561) * been increased by the caller of this function. Note that after
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1562) * this operation completes, the poll callback can start hitting
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1563) * the new item.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1564) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1565) revents = ep_item_poll(epi, &epq.pt, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1566)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1567) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1568) * We have to check if something went wrong during the poll wait queue
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1569) * install process. Namely, an allocation for a wait queue failed due to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1570) * high memory pressure.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1571) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1572) error = -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1573) if (epi->nwait < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1574) goto error_unregister;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1575)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1576) /* We have to drop the new item inside our item list to keep track of it */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1577) write_lock_irq(&ep->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1578)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1579) /* record NAPI ID of new item if present */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1580) ep_set_busy_poll_napi_id(epi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1581)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1582) /* If the file is already "ready" we drop it inside the ready list */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1583) if (revents && !ep_is_linked(epi)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1584) list_add_tail(&epi->rdllink, &ep->rdllist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1585) ep_pm_stay_awake(epi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1586)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1587) /* Notify waiting tasks that events are available */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1588) if (waitqueue_active(&ep->wq))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1589) wake_up(&ep->wq);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1590) if (waitqueue_active(&ep->poll_wait))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1591) pwake++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1592) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1593)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1594) write_unlock_irq(&ep->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1595)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1596) atomic_long_inc(&ep->user->epoll_watches);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1597)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1598) /* We have to call this outside the lock */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1599) if (pwake)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1600) ep_poll_safewake(ep, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1601)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1602) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1603)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1604) error_unregister:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1605) ep_unregister_pollwait(ep, epi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1606) error_remove_epi:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1607) spin_lock(&tfile->f_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1608) list_del_rcu(&epi->fllink);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1609) spin_unlock(&tfile->f_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1610)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1611) rb_erase_cached(&epi->rbn, &ep->rbr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1612)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1613) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1614) * We need to do this because an event could have arrived on some
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1615) * allocated wait queue. Note that we don't care about the ep->ovflist
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1616) * list, since that is used/cleaned only inside a section bound by "mtx".
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1617) * And ep_insert() is called with "mtx" held.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1618) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1619) write_lock_irq(&ep->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1620) if (ep_is_linked(epi))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1621) list_del_init(&epi->rdllink);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1622) write_unlock_irq(&ep->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1623)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1624) wakeup_source_unregister(ep_wakeup_source(epi));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1625)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1626) error_create_wakeup_source:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1627) kmem_cache_free(epi_cache, epi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1628)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1629) return error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1630) }
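/*
 * Illustrative, hypothetical userspace sketch (not part of this file) of
 * the EPOLL_CTL_ADD path that ep_insert() above implements; the fd names
 * are assumptions for the example only.  A failed insert surfaces to the
 * caller as -1 with errno set, e.g. ENOSPC once the per-user
 * max_user_watches limit checked at the top of ep_insert() is exceeded.
 *
 *	#include <sys/epoll.h>
 *	#include <stdio.h>
 *
 *	int watch_read_end(int epfd, int pipe_rd)
 *	{
 *		struct epoll_event ev = {
 *			.events  = EPOLLIN,	// wait for readable data
 *			.data.fd = pipe_rd,	// echoed back by epoll_wait()
 *		};
 *
 *		if (epoll_ctl(epfd, EPOLL_CTL_ADD, pipe_rd, &ev) < 0) {
 *			perror("EPOLL_CTL_ADD");	// e.g. ENOSPC, ENOMEM
 *			return -1;
 *		}
 *		return 0;
 *	}
 */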
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1631)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1632) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1633) * Modify the interest event mask and, if the new mask matches the current
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1634) * file state, queue the item on the ready list. Must be called with "mtx" held.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1635) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1636) static int ep_modify(struct eventpoll *ep, struct epitem *epi,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1637) const struct epoll_event *event)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1638) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1639) int pwake = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1640) poll_table pt;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1641)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1642) lockdep_assert_irqs_enabled();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1643)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1644) init_poll_funcptr(&pt, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1645)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1646) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1647) * Set the new event interest mask before calling f_op->poll();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1648) * otherwise we might miss an event that happens between the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1649) * f_op->poll() call and the registration of the new event set.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1650) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1651) epi->event.events = event->events; /* need barrier below */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1652) epi->event.data = event->data; /* protected by mtx */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1653) if (epi->event.events & EPOLLWAKEUP) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1654) if (!ep_has_wakeup_source(epi))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1655) ep_create_wakeup_source(epi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1656) } else if (ep_has_wakeup_source(epi)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1657) ep_destroy_wakeup_source(epi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1658) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1659)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1660) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1661) * The following barrier has two effects:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1662) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1663) * 1) Flush epi changes above to other CPUs. This ensures
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1664) * we do not miss events from ep_poll_callback if an
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1665) * event occurs immediately after we call f_op->poll().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1666) * We need this because we did not take ep->lock while
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1667) * changing epi above (but ep_poll_callback does take
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1668) * ep->lock).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1669) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1670) * 2) We also need to ensure we do not miss _past_ events
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1671) * when calling f_op->poll(). This barrier also
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1672) * pairs with the barrier in wq_has_sleeper (see
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1673) * comments for wq_has_sleeper).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1674) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1675) * This barrier will now guarantee ep_poll_callback or f_op->poll
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1676) * (or both) will notice the readiness of an item.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1677) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1678) smp_mb();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1679)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1680) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1681) * Get current event bits. We can safely use the file* here because
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1682) * its usage count has been increased by the caller of this function.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1683) * If the item is "hot" and it is not registered inside the ready
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1684) * list, push it inside.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1685) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1686) if (ep_item_poll(epi, &pt, 1)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1687) write_lock_irq(&ep->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1688) if (!ep_is_linked(epi)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1689) list_add_tail(&epi->rdllink, &ep->rdllist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1690) ep_pm_stay_awake(epi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1691)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1692) /* Notify waiting tasks that events are available */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1693) if (waitqueue_active(&ep->wq))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1694) wake_up(&ep->wq);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1695) if (waitqueue_active(&ep->poll_wait))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1696) pwake++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1697) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1698) write_unlock_irq(&ep->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1699) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1700)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1701) /* We have to call this outside the lock */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1702) if (pwake)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1703) ep_poll_safewake(ep, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1704)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1705) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1706) }
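/*
 * A hedged userspace sketch (identifiers are illustrative only) of what
 * ep_modify() above enables: because the new mask is installed before the
 * file is polled again, re-arming an EPOLLONESHOT descriptor with
 * EPOLL_CTL_MOD cannot miss a condition that is already pending.
 *
 *	#include <sys/epoll.h>
 *
 *	// Re-enable a one-shot descriptor after its event was consumed.
 *	static int rearm_oneshot(int epfd, int fd)
 *	{
 *		struct epoll_event ev = {
 *			.events  = EPOLLIN | EPOLLONESHOT,
 *			.data.fd = fd,
 *		};
 *
 *		return epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ev);
 *	}
 */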
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1707)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1708) static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1709) void *priv)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1710) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1711) struct ep_send_events_data *esed = priv;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1712) __poll_t revents;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1713) struct epitem *epi, *tmp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1714) struct epoll_event __user *uevent = esed->events;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1715) struct wakeup_source *ws;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1716) poll_table pt;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1717)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1718) init_poll_funcptr(&pt, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1719) esed->res = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1720)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1721) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1722) * We can loop without the lock because we are passed a task-private list.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1723) * Items cannot vanish during the loop because ep_scan_ready_list() is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1724) * holding "mtx" during this call.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1725) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1726) lockdep_assert_held(&ep->mtx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1727)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1728) list_for_each_entry_safe(epi, tmp, head, rdllink) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1729) if (esed->res >= esed->maxevents)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1730) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1731)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1732) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1733) * Activate ep->ws before deactivating epi->ws to prevent
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1734) * triggering auto-suspend here (in case we reactivate epi->ws
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1735) * below).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1736) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1737) * This could be rearranged to delay the deactivation of epi->ws
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1738) * instead, but then epi->ws would temporarily be out of sync
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1739) * with ep_is_linked().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1740) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1741) ws = ep_wakeup_source(epi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1742) if (ws) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1743) if (ws->active)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1744) __pm_stay_awake(ep->ws);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1745) __pm_relax(ws);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1746) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1747)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1748) list_del_init(&epi->rdllink);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1749)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1750) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1751) * If the event mask intersects the caller-requested one,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1752) * deliver the event to userspace. Again, ep_scan_ready_list()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1753) * is holding ep->mtx, so no operations coming from userspace
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1754) * can change the item.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1755) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1756) revents = ep_item_poll(epi, &pt, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1757) if (!revents)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1758) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1759)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1760) if (__put_user(revents, &uevent->events) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1761) __put_user(epi->event.data, &uevent->data)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1762) list_add(&epi->rdllink, head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1763) ep_pm_stay_awake(epi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1764) if (!esed->res)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1765) esed->res = -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1766) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1767) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1768) esed->res++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1769) uevent++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1770) if (epi->event.events & EPOLLONESHOT)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1771) epi->event.events &= EP_PRIVATE_BITS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1772) else if (!(epi->event.events & EPOLLET)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1773) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1774) * If this file has been added in Level
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1775) * Trigger mode, we need to insert it back into
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1776) * the ready list, so that the next call to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1777) * epoll_wait() will check the event
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1778) * availability again. At this point, no one can
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1779) * insert into ep->rdllist besides us. The
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1780) * epoll_ctl() callers are locked out by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1781) * ep_scan_ready_list() holding "mtx", and the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1782) * poll callback will queue new items in ep->ovflist.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1783) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1784) list_add_tail(&epi->rdllink, &ep->rdllist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1785) ep_pm_stay_awake(epi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1786) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1787) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1788)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1789) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1790) }
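/*
 * The EPOLLONESHOT / EPOLLET handling just above is what separates the two
 * delivery modes as seen from userspace.  A hedged illustration (sock_fd is
 * hypothetical): in the default level-triggered mode the item is re-added to
 * the ready list, so a partially drained socket shows up again on the next
 * epoll_wait(); with EPOLLET it is not, so the caller must read until EAGAIN
 * before waiting again.
 *
 *	struct epoll_event ev = {
 *		.events  = EPOLLIN | EPOLLET,	// edge-triggered: drain fully
 *		.data.fd = sock_fd,
 *	};
 *	epoll_ctl(epfd, EPOLL_CTL_ADD, sock_fd, &ev);
 */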
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1791)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1792) static int ep_send_events(struct eventpoll *ep,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1793) struct epoll_event __user *events, int maxevents)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1794) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1795) struct ep_send_events_data esed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1796)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1797) esed.maxevents = maxevents;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1798) esed.events = events;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1799)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1800) ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1801) return esed.res;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1802) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1803)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1804) static inline struct timespec64 ep_set_mstimeout(long ms)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1805) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1806) struct timespec64 now, ts = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1807) .tv_sec = ms / MSEC_PER_SEC,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1808) .tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1809) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1810)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1811) ktime_get_ts64(&now);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1812) return timespec64_add_safe(now, ts);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1813) }
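/*
 * Worked example of the conversion above: with ms = 1300 the relative
 * timeout becomes { .tv_sec = 1300 / 1000 = 1, .tv_nsec = 1000000 *
 * (1300 % 1000) = 300000000 }, i.e. 1.3 s, which timespec64_add_safe()
 * then turns into an absolute expiry based on the current monotonic
 * clock, saturating rather than overflowing for huge timeouts.
 */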
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1814)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1815) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1816) * ep_poll - Retrieves ready events and delivers them to the caller-supplied
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1817) * event buffer.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1818) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1819) * @ep: Pointer to the eventpoll context.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1820) * @events: Pointer to the userspace buffer where the ready events should be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1821) * stored.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1822) * @maxevents: Size (in terms of number of events) of the caller event buffer.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1823) * @timeout: Maximum timeout for the ready events fetch operation, in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1824) * milliseconds. If the @timeout is zero, the function will not block,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1825) * while if the @timeout is less than zero, the function will block
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1826) * until at least one event has been retrieved (or an error
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1827) * occurred).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1828) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1829) * Returns: Returns the number of ready events which have been fetched, or an
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1830) * error code, in case of error.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1831) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1832) static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1833) int maxevents, long timeout)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1834) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1835) int res = 0, eavail, timed_out = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1836) u64 slack = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1837) wait_queue_entry_t wait;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1838) ktime_t expires, *to = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1839)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1840) lockdep_assert_irqs_enabled();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1841)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1842) if (timeout > 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1843) struct timespec64 end_time = ep_set_mstimeout(timeout);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1844)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1845) slack = select_estimate_accuracy(&end_time);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1846) to = &expires;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1847) *to = timespec64_to_ktime(end_time);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1848) } else if (timeout == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1849) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1850) * Avoid the unnecessary trip to the wait queue loop if the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1851) * caller specified a non-blocking operation. We still need the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1852) * lock because we could race with an epi being added to the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1853) * ready list from the irq callback, and thus incorrectly
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1854) * return 0 back to userspace.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1855) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1856) timed_out = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1857)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1858) write_lock_irq(&ep->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1859) eavail = ep_events_available(ep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1860) write_unlock_irq(&ep->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1861)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1862) goto send_events;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1863) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1864)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1865) fetch_events:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1866)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1867) if (!ep_events_available(ep))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1868) ep_busy_loop(ep, timed_out);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1869)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1870) eavail = ep_events_available(ep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1871) if (eavail)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1872) goto send_events;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1873)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1874) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1875) * Busy poll timed out. Drop NAPI ID for now, we can add
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1876) * it back in when we have moved a socket with a valid NAPI
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1877) * ID onto the ready list.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1878) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1879) ep_reset_busy_poll_napi_id(ep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1880)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1881) do {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1882) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1883) * Internally init_wait() uses autoremove_wake_function(),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1884) * thus the wait entry is removed from the wait queue on each
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1885) * wakeup. Why is this important? With several waiters,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1886) * each new wakeup will hit the next waiter, giving it the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1887) * chance to harvest new events. Otherwise a wakeup can be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1888) * lost. This is also good performance-wise, because on the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1889) * normal wakeup path there is no need to call __remove_wait_queue()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1890) * explicitly, thus ep->lock is not taken, which would otherwise
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1891) * stall event delivery.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1892) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1893) init_wait(&wait);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1894)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1895) write_lock_irq(&ep->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1896) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1897) * Barrierless variant, waitqueue_active() is called under
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1898) * the same lock on wakeup ep_poll_callback() side, so it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1899) * is safe to avoid an explicit barrier.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1900) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1901) __set_current_state(TASK_INTERRUPTIBLE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1902)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1903) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1904) * Do the final check under the lock. ep_scan_ready_list()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1905) * plays with two lists (->rdllist and ->ovflist) and there
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1906) * is always a race window when both lists are empty for a short
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1907) * period of time although events are pending, so the lock is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1908) * important.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1909) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1910) eavail = ep_events_available(ep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1911) if (!eavail) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1912) if (signal_pending(current))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1913) res = -EINTR;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1914) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1915) __add_wait_queue_exclusive(&ep->wq, &wait);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1916) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1917) write_unlock_irq(&ep->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1918)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1919) if (!eavail && !res)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1920) timed_out = !freezable_schedule_hrtimeout_range(to, slack,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1921) HRTIMER_MODE_ABS);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1922)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1923) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1924) * We were woken up, thus go and try to harvest some events.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1925) * If timed out and still on the wait queue, recheck eavail
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1926) * carefully under lock, below.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1927) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1928) eavail = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1929) } while (0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1930)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1931) __set_current_state(TASK_RUNNING);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1932)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1933) if (!list_empty_careful(&wait.entry)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1934) write_lock_irq(&ep->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1935) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1936) * If the thread timed out and is not on the wait queue, it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1937) * means that the thread was woken up after its timeout expired
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1938) * before it could reacquire the lock. Thus, when wait.entry is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1939) * empty, it needs to harvest events.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1940) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1941) if (timed_out)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1942) eavail = list_empty(&wait.entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1943) __remove_wait_queue(&ep->wq, &wait);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1944) write_unlock_irq(&ep->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1945) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1946)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1947) send_events:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1948) if (fatal_signal_pending(current)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1949) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1950) * Always short-circuit for fatal signals to allow
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1951) * threads to make a timely exit without the chance of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1952) * finding more events available and fetching
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1953) * repeatedly.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1954) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1955) res = -EINTR;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1956) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1957) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1958) * Try to transfer events to user space. In case we get 0 events and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1959) * there's still timeout left over, we try again in search of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1960) * more luck.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1961) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1962) if (!res && eavail &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1963) !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1964) goto fetch_events;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1965)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1966) return res;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1967) }
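/*
 * A minimal, hypothetical caller of the epoll_wait(2) path served by
 * ep_poll() above, showing the timeout modes the kernel-doc describes
 * (0 = poll and return, < 0 = wait indefinitely, > 0 = wait that many
 * milliseconds).  Buffer size and output format are assumptions.
 *
 *	#include <sys/epoll.h>
 *	#include <stdio.h>
 *
 *	#define MAXEV 16
 *
 *	static void drain(int epfd)
 *	{
 *		struct epoll_event evs[MAXEV];
 *		int n;
 *
 *		// 0: return immediately; -1: block until an event or a
 *		// signal; > 0: block for at most that many milliseconds.
 *		n = epoll_wait(epfd, evs, MAXEV, 500);
 *		if (n < 0) {
 *			perror("epoll_wait");	// e.g. EINTR on a signal
 *			return;
 *		}
 *		for (int i = 0; i < n; i++)
 *			printf("fd %d ready: events 0x%x\n",
 *			       evs[i].data.fd, (unsigned int)evs[i].events);
 *	}
 */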
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1968)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1969) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1970) * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1971) * API, to verify that adding an epoll file inside another
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1972) * epoll structure does not violate the constraints, in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1973) * terms of closed loops, or too deep chains (which can
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1974) * result in excessive stack usage).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1975) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1976) * @priv: Pointer to the epoll file to be currently checked.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1977) * @cookie: Original cookie for this call. This is the top-of-the-chain epoll
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1978) * data structure pointer.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1979) * @call_nests: Current depth of the @ep_call_nested() call stack.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1980) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1981) * Returns: Returns zero if adding the epoll @file inside current epoll
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1982) * structure @ep does not violate the constraints, or -1 otherwise.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1983) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1984) static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1985) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1986) int error = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1987) struct file *file = priv;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1988) struct eventpoll *ep = file->private_data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1989) struct eventpoll *ep_tovisit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1990) struct rb_node *rbp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1991) struct epitem *epi;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1992)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1993) mutex_lock_nested(&ep->mtx, call_nests + 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1994) ep->gen = loop_check_gen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1995) for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1996) epi = rb_entry(rbp, struct epitem, rbn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1997) if (unlikely(is_file_epoll(epi->ffd.file))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1998) ep_tovisit = epi->ffd.file->private_data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1999) if (ep_tovisit->gen == loop_check_gen)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2000) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2001) error = ep_call_nested(&poll_loop_ncalls,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2002) ep_loop_check_proc, epi->ffd.file,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2003) ep_tovisit, current);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2004) if (error != 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2005) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2006) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2007) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2008) * If we've reached a file that is not associated with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2009) * an ep, then we need to check if the newly added
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2010) * links are going to add too many wakeup paths. We do
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2011) * this by adding it to the tfile_check_list, if it's
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2012) * not already there, and calling reverse_path_check()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2013) * during ep_insert().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2014) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2015) if (list_empty(&epi->ffd.file->f_tfile_llink)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2016) if (get_file_rcu(epi->ffd.file))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2017) list_add(&epi->ffd.file->f_tfile_llink,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2018) &tfile_check_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2019) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2020) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2021) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2022) mutex_unlock(&ep->mtx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2023)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2024) return error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2025) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2026)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2027) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2028) * ep_loop_check - Performs a check to verify that adding an epoll file (@file)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2029) * inside another epoll file (represented by @ep) does not create
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2030) * closed loops or too deep chains.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2031) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2032) * @ep: Pointer to the epoll private data structure.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2033) * @file: Pointer to the epoll file to be checked.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2034) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2035) * Returns: Returns zero if adding the epoll @file inside current epoll
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2036) * structure @ep does not violate the constraints, or -1 otherwise.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2037) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2038) static int ep_loop_check(struct eventpoll *ep, struct file *file)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2039) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2040) return ep_call_nested(&poll_loop_ncalls,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2041) ep_loop_check_proc, file, ep, current);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2042) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2043)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2044) static void clear_tfile_check_list(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2045) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2046) struct file *file;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2047)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2048) /* first clear the tfile_check_list */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2049) while (!list_empty(&tfile_check_list)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2050) file = list_first_entry(&tfile_check_list, struct file,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2051) f_tfile_llink);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2052) list_del_init(&file->f_tfile_llink);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2053) fput(file);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2054) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2055) INIT_LIST_HEAD(&tfile_check_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2056) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2057)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2058) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2059) * Open an eventpoll file descriptor.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2060) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2061) static int do_epoll_create(int flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2062) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2063) int error, fd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2064) struct eventpoll *ep = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2065) struct file *file;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2066)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2067) /* Check the EPOLL_* constant for consistency. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2068) BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2069)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2070) if (flags & ~EPOLL_CLOEXEC)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2071) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2072) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2073) * Create the internal data structure ("struct eventpoll").
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2074) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2075) error = ep_alloc(&ep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2076) if (error < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2077) return error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2078) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2079) * Creates all the items needed to set up an eventpoll file. That is,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2080) * a file structure and a free file descriptor.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2081) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2082) fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2083) if (fd < 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2084) error = fd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2085) goto out_free_ep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2086) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2087) file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2088) O_RDWR | (flags & O_CLOEXEC));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2089) if (IS_ERR(file)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2090) error = PTR_ERR(file);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2091) goto out_free_fd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2092) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2093) ep->file = file;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2094) fd_install(fd, file);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2095) return fd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2096)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2097) out_free_fd:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2098) put_unused_fd(fd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2099) out_free_ep:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2100) ep_free(ep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2101) return error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2102) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2103)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2104) SYSCALL_DEFINE1(epoll_create1, int, flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2105) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2106) return do_epoll_create(flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2107) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2108)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2109) SYSCALL_DEFINE1(epoll_create, int, size)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2110) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2111) if (size <= 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2112) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2113)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2114) return do_epoll_create(0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2115) }
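/*
 * A hedged userspace sketch of the creation paths above: epoll_create(size)
 * only checks that size > 0 and otherwise ignores it, so new code would
 * normally call epoll_create1() and pass EPOLL_CLOEXEC when the descriptor
 * must not leak across exec.  The helper name is illustrative only.
 *
 *	#include <sys/epoll.h>
 *	#include <stdio.h>
 *
 *	int make_epoll_fd(void)
 *	{
 *		int epfd = epoll_create1(EPOLL_CLOEXEC);
 *
 *		if (epfd < 0)
 *			perror("epoll_create1");	// e.g. EMFILE, ENOMEM
 *		return epfd;
 *	}
 */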
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2116)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2117) static inline int epoll_mutex_lock(struct mutex *mutex, int depth,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2118) bool nonblock)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2119) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2120) if (!nonblock) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2121) mutex_lock_nested(mutex, depth);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2122) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2123) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2124) if (mutex_trylock(mutex))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2125) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2126) return -EAGAIN;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2127) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2128)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2129) int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2130) bool nonblock)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2131) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2132) int error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2133) int full_check = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2134) struct fd f, tf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2135) struct eventpoll *ep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2136) struct epitem *epi;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2137) struct eventpoll *tep = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2138)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2139) error = -EBADF;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2140) f = fdget(epfd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2141) if (!f.file)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2142) goto error_return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2143)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2144) /* Get the "struct file *" for the target file */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2145) tf = fdget(fd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2146) if (!tf.file)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2147) goto error_fput;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2148)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2149) /* The target file descriptor must support poll */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2150) error = -EPERM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2151) if (!file_can_poll(tf.file))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2152) goto error_tgt_fput;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2153)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2154) /* Check if EPOLLWAKEUP is allowed */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2155) if (ep_op_has_event(op))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2156) ep_take_care_of_epollwakeup(epds);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2157)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2158) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2159) * We have to check that the file structure underneath the file descriptor
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2160) * the user passed to us _is_ an eventpoll file. And also we do not permit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2161) * adding an epoll file descriptor inside itself.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2162) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2163) error = -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2164) if (f.file == tf.file || !is_file_epoll(f.file))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2165) goto error_tgt_fput;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2166)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2167) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2168) * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2169) * so EPOLLEXCLUSIVE is not allowed for an EPOLL_CTL_MOD operation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2170) * Also, we do not currently support nested exclusive wakeups.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2171) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2172) if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2173) if (op == EPOLL_CTL_MOD)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2174) goto error_tgt_fput;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2175) if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2176) (epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2177) goto error_tgt_fput;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2178) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2179)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2180) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2181) * At this point it is safe to assume that the "private_data" contains
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2182) * our own data structure.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2183) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2184) ep = f.file->private_data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2185)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2186) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2187) * When we insert an epoll file descriptor inside another epoll file
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2188) * descriptor, there is the chance of creating closed loops, which are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2189) * better handled here than in more critical paths. While we are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2190) * checking for loops we also determine the list of files reachable
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2191) * and hang them on the tfile_check_list, so we can check that we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2192) * haven't created too many possible wakeup paths.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2193) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2194) * We do not need to take the global 'epmutex' on EPOLL_CTL_ADD when
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2195) * the epoll file descriptor is attaching directly to a wakeup source,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2196) * unless the epoll file descriptor is nested. The purpose of taking the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2197) * 'epmutex' on add is to prevent complex topologies such as loops and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2198) * deep wakeup paths from forming in parallel through multiple
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2199) * EPOLL_CTL_ADD operations.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2200) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2201) error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2202) if (error)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2203) goto error_tgt_fput;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2204) if (op == EPOLL_CTL_ADD) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2205) if (!list_empty(&f.file->f_ep_links) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2206) ep->gen == loop_check_gen ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2207) is_file_epoll(tf.file)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2208) mutex_unlock(&ep->mtx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2209) error = epoll_mutex_lock(&epmutex, 0, nonblock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2210) if (error)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2211) goto error_tgt_fput;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2212) loop_check_gen++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2213) full_check = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2214) if (is_file_epoll(tf.file)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2215) error = -ELOOP;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2216) if (ep_loop_check(ep, tf.file) != 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2217) goto error_tgt_fput;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2218) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2219) get_file(tf.file);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2220) list_add(&tf.file->f_tfile_llink,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2221) &tfile_check_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2222) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2223) error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2224) if (error)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2225) goto error_tgt_fput;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2226) if (is_file_epoll(tf.file)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2227) tep = tf.file->private_data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2228) error = epoll_mutex_lock(&tep->mtx, 1, nonblock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2229) if (error) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2230) mutex_unlock(&ep->mtx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2231) goto error_tgt_fput;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2232) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2233) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2234) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2235) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2236)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2237) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2238) * Try to look up the file inside our RB tree. Since we grabbed "mtx"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2239) * above, we can be sure to be able to use the item looked up by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2240) * ep_find() until we release the mutex.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2241) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2242) epi = ep_find(ep, tf.file, fd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2243)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2244) error = -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2245) switch (op) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2246) case EPOLL_CTL_ADD:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2247) if (!epi) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2248) epds->events |= EPOLLERR | EPOLLHUP;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2249) error = ep_insert(ep, epds, tf.file, fd, full_check);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2250) } else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2251) error = -EEXIST;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2252) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2253) case EPOLL_CTL_DEL:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2254) if (epi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2255) error = ep_remove(ep, epi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2256) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2257) error = -ENOENT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2258) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2259) case EPOLL_CTL_MOD:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2260) if (epi) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2261) if (!(epi->event.events & EPOLLEXCLUSIVE)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2262) epds->events |= EPOLLERR | EPOLLHUP;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2263) error = ep_modify(ep, epi, epds);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2264) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2265) } else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2266) error = -ENOENT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2267) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2268) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2269) if (tep != NULL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2270) mutex_unlock(&tep->mtx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2271) mutex_unlock(&ep->mtx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2272)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2273) error_tgt_fput:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2274) if (full_check) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2275) clear_tfile_check_list();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2276) loop_check_gen++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2277) mutex_unlock(&epmutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2278) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2279)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2280) fdput(tf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2281) error_fput:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2282) fdput(f);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2283) error_return:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2284)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2285) return error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2286) }
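/*
 * The switch above defines the userspace error contract of epoll_ctl(2):
 * EEXIST when adding an fd that is already watched, ENOENT when modifying
 * or deleting one that is not, and ELOOP when an added epoll fd would form
 * a cycle.  A hedged sketch (identifiers are illustrative only):
 *
 *	#include <sys/epoll.h>
 *	#include <errno.h>
 *
 *	static int watch_or_update(int epfd, int fd, unsigned int events)
 *	{
 *		struct epoll_event ev = { .events = events, .data.fd = fd };
 *
 *		if (epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ev) == 0)
 *			return 0;
 *		if (errno == ENOENT)	// not registered yet: fall back to ADD
 *			return epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev);
 *		return -1;
 *	}
 */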
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2287)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2288) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2289) * The following function implements the controller interface for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2290) * the eventpoll file that enables the insertion/removal/change of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2291) * file descriptors inside the interest set.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2292) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2293) SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2294) struct epoll_event __user *, event)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2295) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2296) struct epoll_event epds;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2297)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2298) if (ep_op_has_event(op) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2299) copy_from_user(&epds, event, sizeof(struct epoll_event)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2300) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2301)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2302) return do_epoll_ctl(epfd, op, fd, &epds, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2303) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2304)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2305) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2306) * Implement the event wait interface for the eventpoll file. It is the kernel
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2307) * part of the user space epoll_wait(2).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2308) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2309) static int do_epoll_wait(int epfd, struct epoll_event __user *events,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2310) int maxevents, int timeout)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2311) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2312) int error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2313) struct fd f;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2314) struct eventpoll *ep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2315)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2316) /* The maximum number of events must be greater than zero */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2317) if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2318) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2319)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2320) /* Verify that the area passed by the user is writeable */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2321) if (!access_ok(events, maxevents * sizeof(struct epoll_event)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2322) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2323)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2324) /* Get the "struct file *" for the eventpoll file */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2325) f = fdget(epfd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2326) if (!f.file)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2327) return -EBADF;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2328)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2329) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2330) * We have to check that the file structure underneath the fd
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2331) * the user passed to us _is_ an eventpoll file.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2332) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2333) error = -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2334) if (!is_file_epoll(f.file))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2335) goto error_fput;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2336)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2337) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2338) * At this point it is safe to assume that the "private_data" contains
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2339) * our own data structure.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2340) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2341) ep = f.file->private_data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2342)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2343) /* Time to fish for events ... */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2344) error = ep_poll(ep, events, maxevents, timeout);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2345)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2346) error_fput:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2347) fdput(f);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2348) return error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2349) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2350)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2351) SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2352) int, maxevents, int, timeout)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2353) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2354) return do_epoll_wait(epfd, events, maxevents, timeout);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2355) }
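
/*
 * Hedged user-space sketch of the wait side, not part of this file;
 * "epfd" is the instance from the epoll_ctl sketch above and
 * handle_readable() is a hypothetical helper.  maxevents must be
 * positive, mirroring the EP_MAX_EVENTS/-EINVAL check in do_epoll_wait(),
 * and the timeout is in milliseconds (-1 blocks indefinitely, 0 polls).
 *
 *	#include <sys/epoll.h>
 *	#include <stdio.h>
 *
 *	struct epoll_event events[64];
 *	int n = epoll_wait(epfd, events, 64, -1);
 *	if (n == -1) {
 *		perror("epoll_wait");
 *	} else {
 *		for (int i = 0; i < n; i++) {
 *			if (events[i].events & EPOLLIN)
 *				handle_readable(events[i].data.fd);
 *		}
 *	}
 */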
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2356)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2357) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2358) * Implement the event wait interface for the eventpoll file. It is the kernel
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2359) * part of the user space epoll_pwait(2).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2360) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2361) SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2362) int, maxevents, int, timeout, const sigset_t __user *, sigmask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2363) size_t, sigsetsize)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2364) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2365) int error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2366)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2367) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2368) * If the caller wants a certain signal mask to be set during the wait,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2369) * we apply it here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2370) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2371) error = set_user_sigmask(sigmask, sigsetsize);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2372) if (error)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2373) return error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2374)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2375) error = do_epoll_wait(epfd, events, maxevents, timeout);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2376) restore_saved_sigmask_unless(error == -EINTR);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2377)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2378) return error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2379) }
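
/*
 * Hedged user-space sketch, not part of this file, of what the sigmask
 * plumbing above buys the caller ("epfd" and "events" are as in the
 * sketches above): the mask is installed atomically for the duration of
 * the wait by set_user_sigmask() and put back by
 * restore_saved_sigmask_unless() (restoration is deferred when the wait
 * was interrupted, so the pending signal is delivered first).  This
 * closes the race between "check a flag set by a signal handler" and
 * "block in epoll_wait":
 *
 *	#include <errno.h>
 *	#include <signal.h>
 *	#include <stdio.h>
 *	#include <sys/epoll.h>
 *
 *	sigset_t waitmask;
 *	sigemptyset(&waitmask);		// unblock everything while waiting
 *	int n = epoll_pwait(epfd, events, 64, -1, &waitmask);
 *	if (n == -1 && errno == EINTR)
 *		printf("interrupted by a signal during the wait\n");
 */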
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2380)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2381) #ifdef CONFIG_COMPAT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2382) COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2383) struct epoll_event __user *, events,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2384) int, maxevents, int, timeout,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2385) const compat_sigset_t __user *, sigmask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2386) compat_size_t, sigsetsize)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2387) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2388) long err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2389)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2390) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2391) * If the caller wants a certain signal mask to be set during the wait,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2392) * we apply it here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2393) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2394) err = set_compat_user_sigmask(sigmask, sigsetsize);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2395) if (err)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2396) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2397)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2398) err = do_epoll_wait(epfd, events, maxevents, timeout);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2399) restore_saved_sigmask_unless(err == -EINTR);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2400)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2401) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2402) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2403) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2404)
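/*
 * Hedged worked example (illustrative numbers only) of the per-user watch
 * limit computed in eventpoll_init() below.  The expression takes the
 * low-memory page count (si.totalram - si.totalhigh), keeps 1/25th of it
 * (4%), converts pages to bytes via PAGE_SHIFT, and divides by
 * EP_ITEM_COST, the per-watch memory footprint defined earlier in this
 * file.  Assuming 4 KiB pages, 1 GiB of lowmem and a per-watch cost of
 * roughly 200 bytes:
 *
 *	262144 pages / 25       = 10485 pages
 *	10485 pages << 12       = ~42.9 MB of watch budget
 *	~42.9 MB / ~200 bytes   = roughly 215000 watches per user
 */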
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2405) static int __init eventpoll_init(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2406) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2407) struct sysinfo si;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2408)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2409) si_meminfo(&si);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2410) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2411) * Allows up to 4% of lowmem to be allocated for epoll watches (per user).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2412) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2413) max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2414) EP_ITEM_COST;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2415) BUG_ON(max_user_watches < 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2416)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2417) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2418) * Initialize the structure used to perform epoll file descriptor
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2419) * inclusion loop checks.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2420) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2421) ep_nested_calls_init(&poll_loop_ncalls);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2422)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2423) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2424) * We can have many thousands of epitems, so prevent this from
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2425) * using an extra cache line on 64-bit (and smaller) CPUs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2426) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2427) BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2428)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2429) /* Allocates slab cache used to allocate "struct epitem" items */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2430) epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2431) 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2432)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2433) /* Allocates slab cache used to allocate "struct eppoll_entry" */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2434) pwq_cache = kmem_cache_create("eventpoll_pwq",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2435) sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2436)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2437) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2438) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2439) fs_initcall(eventpoll_init);