LCOV - coverage.info - fs/eventpoll.c

LCOV - code coverage report

Current view:	top level - fs - eventpoll.c (source / functions)		Hit	Total	Coverage
Test:	coverage.info	Lines:	12	630	1.9 %
Date:	2022-12-09 01:23:36	Functions:	2	44	4.5 %

          Line data    Source code

       1             : // SPDX-License-Identifier: GPL-2.0-or-later
       2             : /*
       3             :  *  fs/eventpoll.c (Efficient event retrieval implementation)
       4             :  *  Copyright (C) 2001,...,2009  Davide Libenzi
       5             :  *
       6             :  *  Davide Libenzi <davidel@xmailserver.org>
       7             :  */
       8             : 
       9             : #include <linux/init.h>
      10             : #include <linux/kernel.h>
      11             : #include <linux/sched/signal.h>
      12             : #include <linux/fs.h>
      13             : #include <linux/file.h>
      14             : #include <linux/signal.h>
      15             : #include <linux/errno.h>
      16             : #include <linux/mm.h>
      17             : #include <linux/slab.h>
      18             : #include <linux/poll.h>
      19             : #include <linux/string.h>
      20             : #include <linux/list.h>
      21             : #include <linux/hash.h>
      22             : #include <linux/spinlock.h>
      23             : #include <linux/syscalls.h>
      24             : #include <linux/rbtree.h>
      25             : #include <linux/wait.h>
      26             : #include <linux/eventpoll.h>
      27             : #include <linux/mount.h>
      28             : #include <linux/bitops.h>
      29             : #include <linux/mutex.h>
      30             : #include <linux/anon_inodes.h>
      31             : #include <linux/device.h>
      32             : #include <linux/uaccess.h>
      33             : #include <asm/io.h>
      34             : #include <asm/mman.h>
      35             : #include <linux/atomic.h>
      36             : #include <linux/proc_fs.h>
      37             : #include <linux/seq_file.h>
      38             : #include <linux/compat.h>
      39             : #include <linux/rculist.h>
      40             : #include <net/busy_poll.h>
      41             : 
      42             : /*
      43             :  * LOCKING:
      44             :  * There are three level of locking required by epoll :
      45             :  *
      46             :  * 1) epmutex (mutex)
      47             :  * 2) ep->mtx (mutex)
      48             :  * 3) ep->lock (rwlock)
      49             :  *
      50             :  * The acquire order is the one listed above, from 1 to 3.
      51             :  * We need a rwlock (ep->lock) because we manipulate objects
      52             :  * from inside the poll callback, that might be triggered from
      53             :  * a wake_up() that in turn might be called from IRQ context.
      54             :  * So we can't sleep inside the poll callback and hence we need
      55             :  * a spinlock. During the event transfer loop (from kernel to
      56             :  * user space) we could end up sleeping due a copy_to_user(), so
      57             :  * we need a lock that will allow us to sleep. This lock is a
      58             :  * mutex (ep->mtx). It is acquired during the event transfer loop,
      59             :  * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file().
      60             :  * Then we also need a global mutex to serialize eventpoll_release_file()
      61             :  * and ep_free().
      62             :  * This mutex is acquired by ep_free() during the epoll file
      63             :  * cleanup path and it is also acquired by eventpoll_release_file()
      64             :  * if a file has been pushed inside an epoll set and it is then
      65             :  * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL).
      66             :  * It is also acquired when inserting an epoll fd onto another epoll
      67             :  * fd. We do this so that we walk the epoll tree and ensure that this
      68             :  * insertion does not create a cycle of epoll file descriptors, which
      69             :  * could lead to deadlock. We need a global mutex to prevent two
      70             :  * simultaneous inserts (A into B and B into A) from racing and
      71             :  * constructing a cycle without either insert observing that it is
      72             :  * going to.
      73             :  * It is necessary to acquire multiple "ep->mtx"es at once in the
      74             :  * case when one epoll fd is added to another. In this case, we
      75             :  * always acquire the locks in the order of nesting (i.e. after
      76             :  * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired
      77             :  * before e2->mtx). Since we disallow cycles of epoll file
      78             :  * descriptors, this ensures that the mutexes are well-ordered. In
      79             :  * order to communicate this nesting to lockdep, when walking a tree
      80             :  * of epoll file descriptors, we use the current recursion depth as
      81             :  * the lockdep subkey.
      82             :  * It is possible to drop the "ep->mtx" and to use the global
      83             :  * mutex "epmutex" (together with "ep->lock") to have it working,
      84             :  * but having "ep->mtx" will make the interface more scalable.
      85             :  * Events that require holding "epmutex" are very rare, while for
      86             :  * normal operations the epoll private "ep->mtx" will guarantee
      87             :  * a better scalability.
      88             :  */
      89             : 
      90             : /* Epoll private bits inside the event mask */
      91             : #define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)
      92             : 
      93             : #define EPOLLINOUT_BITS (EPOLLIN | EPOLLOUT)
      94             : 
      95             : #define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | EPOLLERR | EPOLLHUP | \
      96             :                                 EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE)
      97             : 
      98             : /* Maximum number of nesting allowed inside epoll sets */
      99             : #define EP_MAX_NESTS 4
     100             : 
     101             : #define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
     102             : 
     103             : #define EP_UNACTIVE_PTR ((void *) -1L)
     104             : 
     105             : #define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))
     106             : 
     107             : struct epoll_filefd {
     108             :         struct file *file;
     109             :         int fd;
     110             : } __packed;
     111             : 
     112             : /* Wait structure used by the poll hooks */
     113             : struct eppoll_entry {
     114             :         /* List header used to link this structure to the "struct epitem" */
     115             :         struct eppoll_entry *next;
     116             : 
     117             :         /* The "base" pointer is set to the container "struct epitem" */
     118             :         struct epitem *base;
     119             : 
     120             :         /*
     121             :          * Wait queue item that will be linked to the target file wait
     122             :          * queue head.
     123             :          */
     124             :         wait_queue_entry_t wait;
     125             : 
     126             :         /* The wait queue head that linked the "wait" wait queue item */
     127             :         wait_queue_head_t *whead;
     128             : };
     129             : 
     130             : /*
     131             :  * Each file descriptor added to the eventpoll interface will
     132             :  * have an entry of this type linked to the "rbr" RB tree.
     133             :  * Avoid increasing the size of this struct, there can be many thousands
     134             :  * of these on a server and we do not want this to take another cache line.
     135             :  */
     136             : struct epitem {
     137             :         union {
     138             :                 /* RB tree node links this structure to the eventpoll RB tree */
     139             :                 struct rb_node rbn;
     140             :                 /* Used to free the struct epitem */
     141             :                 struct rcu_head rcu;
     142             :         };
     143             : 
     144             :         /* List header used to link this structure to the eventpoll ready list */
     145             :         struct list_head rdllink;
     146             : 
     147             :         /*
     148             :          * Works together "struct eventpoll"->ovflist in keeping the
     149             :          * single linked chain of items.
     150             :          */
     151             :         struct epitem *next;
     152             : 
     153             :         /* The file descriptor information this item refers to */
     154             :         struct epoll_filefd ffd;
     155             : 
     156             :         /* List containing poll wait queues */
     157             :         struct eppoll_entry *pwqlist;
     158             : 
     159             :         /* The "container" of this item */
     160             :         struct eventpoll *ep;
     161             : 
     162             :         /* List header used to link this item to the "struct file" items list */
     163             :         struct hlist_node fllink;
     164             : 
     165             :         /* wakeup_source used when EPOLLWAKEUP is set */
     166             :         struct wakeup_source __rcu *ws;
     167             : 
     168             :         /* The structure that describe the interested events and the source fd */
     169             :         struct epoll_event event;
     170             : };
     171             : 
     172             : /*
     173             :  * This structure is stored inside the "private_data" member of the file
     174             :  * structure and represents the main data structure for the eventpoll
     175             :  * interface.
     176             :  */
     177             : struct eventpoll {
     178             :         /*
     179             :          * This mutex is used to ensure that files are not removed
     180             :          * while epoll is using them. This is held during the event
     181             :          * collection loop, the file cleanup path, the epoll file exit
     182             :          * code and the ctl operations.
     183             :          */
     184             :         struct mutex mtx;
     185             : 
     186             :         /* Wait queue used by sys_epoll_wait() */
     187             :         wait_queue_head_t wq;
     188             : 
     189             :         /* Wait queue used by file->poll() */
     190             :         wait_queue_head_t poll_wait;
     191             : 
     192             :         /* List of ready file descriptors */
     193             :         struct list_head rdllist;
     194             : 
     195             :         /* Lock which protects rdllist and ovflist */
     196             :         rwlock_t lock;
     197             : 
     198             :         /* RB tree root used to store monitored fd structs */
     199             :         struct rb_root_cached rbr;
     200             : 
     201             :         /*
     202             :          * This is a single linked list that chains all the "struct epitem" that
     203             :          * happened while transferring ready events to userspace w/out
     204             :          * holding ->lock.
     205             :          */
     206             :         struct epitem *ovflist;
     207             : 
     208             :         /* wakeup_source used when ep_scan_ready_list is running */
     209             :         struct wakeup_source *ws;
     210             : 
     211             :         /* The user that created the eventpoll descriptor */
     212             :         struct user_struct *user;
     213             : 
     214             :         struct file *file;
     215             : 
     216             :         /* used to optimize loop detection check */
     217             :         u64 gen;
     218             :         struct hlist_head refs;
     219             : 
     220             : #ifdef CONFIG_NET_RX_BUSY_POLL
     221             :         /* used to track busy poll napi_id */
     222             :         unsigned int napi_id;
     223             : #endif
     224             : 
     225             : #ifdef CONFIG_DEBUG_LOCK_ALLOC
     226             :         /* tracks wakeup nests for lockdep validation */
     227             :         u8 nests;
     228             : #endif
     229             : };
     230             : 
     231             : /* Wrapper struct used by poll queueing */
     232             : struct ep_pqueue {
     233             :         poll_table pt;
     234             :         struct epitem *epi;
     235             : };
     236             : 
     237             : /*
     238             :  * Configuration options available inside /proc/sys/fs/epoll/
     239             :  */
     240             : /* Maximum number of epoll watched descriptors, per user */
     241             : static long max_user_watches __read_mostly;
     242             : 
     243             : /*
     244             :  * This mutex is used to serialize ep_free() and eventpoll_release_file().
     245             :  */
     246             : static DEFINE_MUTEX(epmutex);
     247             : 
     248             : static u64 loop_check_gen = 0;
     249             : 
     250             : /* Used to check for epoll file descriptor inclusion loops */
     251             : static struct eventpoll *inserting_into;
     252             : 
     253             : /* Slab cache used to allocate "struct epitem" */
     254             : static struct kmem_cache *epi_cache __read_mostly;
     255             : 
     256             : /* Slab cache used to allocate "struct eppoll_entry" */
     257             : static struct kmem_cache *pwq_cache __read_mostly;
     258             : 
     259             : /*
     260             :  * List of files with newly added links, where we may need to limit the number
     261             :  * of emanating paths. Protected by the epmutex.
     262             :  */
     263             : struct epitems_head {
     264             :         struct hlist_head epitems;
     265             :         struct epitems_head *next;
     266             : };
     267             : static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR;
     268             : 
     269             : static struct kmem_cache *ephead_cache __read_mostly;
     270             : 
     271             : static inline void free_ephead(struct epitems_head *head)
     272             : {
     273           0 :         if (head)
     274           0 :                 kmem_cache_free(ephead_cache, head);
     275             : }
     276             : 
     277             : static void list_file(struct file *file)
     278             : {
     279             :         struct epitems_head *head;
     280             : 
     281           0 :         head = container_of(file->f_ep, struct epitems_head, epitems);
     282           0 :         if (!head->next) {
     283           0 :                 head->next = tfile_check_list;
     284           0 :                 tfile_check_list = head;
     285             :         }
     286             : }
     287             : 
     288           0 : static void unlist_file(struct epitems_head *head)
     289             : {
     290           0 :         struct epitems_head *to_free = head;
     291           0 :         struct hlist_node *p = rcu_dereference(hlist_first_rcu(&head->epitems));
     292           0 :         if (p) {
     293           0 :                 struct epitem *epi= container_of(p, struct epitem, fllink);
     294           0 :                 spin_lock(&epi->ffd.file->f_lock);
     295           0 :                 if (!hlist_empty(&head->epitems))
     296           0 :                         to_free = NULL;
     297           0 :                 head->next = NULL;
     298           0 :                 spin_unlock(&epi->ffd.file->f_lock);
     299             :         }
     300           0 :         free_ephead(to_free);
     301           0 : }
     302             : 
     303             : #ifdef CONFIG_SYSCTL
     304             : 
     305             : #include <linux/sysctl.h>
     306             : 
     307             : static long long_zero;
     308             : static long long_max = LONG_MAX;
     309             : 
     310             : static struct ctl_table epoll_table[] = {
     311             :         {
     312             :                 .procname       = "max_user_watches",
     313             :                 .data           = &max_user_watches,
     314             :                 .maxlen         = sizeof(max_user_watches),
     315             :                 .mode           = 0644,
     316             :                 .proc_handler   = proc_doulongvec_minmax,
     317             :                 .extra1         = &long_zero,
     318             :                 .extra2         = &long_max,
     319             :         },
     320             :         { }
     321             : };
     322             : 
     323           1 : static void __init epoll_sysctls_init(void)
     324             : {
     325           1 :         register_sysctl("fs/epoll", epoll_table);
     326           1 : }
     327             : #else
     328             : #define epoll_sysctls_init() do { } while (0)
     329             : #endif /* CONFIG_SYSCTL */
     330             : 
     331             : static const struct file_operations eventpoll_fops;
     332             : 
     333             : static inline int is_file_epoll(struct file *f)
     334             : {
     335           0 :         return f->f_op == &eventpoll_fops;
     336             : }
     337             : 
     338             : /* Setup the structure that is used as key for the RB tree */
     339             : static inline void ep_set_ffd(struct epoll_filefd *ffd,
     340             :                               struct file *file, int fd)
     341             : {
     342           0 :         ffd->file = file;
     343           0 :         ffd->fd = fd;
     344             : }
     345             : 
     346             : /* Compare RB tree keys */
     347             : static inline int ep_cmp_ffd(struct epoll_filefd *p1,
     348             :                              struct epoll_filefd *p2)
     349             : {
     350           0 :         return (p1->file > p2->file ? +1:
     351           0 :                 (p1->file < p2->file ? -1 : p1->fd - p2->fd));
     352             : }
     353             : 
     354             : /* Tells us if the item is currently linked */
     355             : static inline int ep_is_linked(struct epitem *epi)
     356             : {
     357           0 :         return !list_empty(&epi->rdllink);
     358             : }
     359             : 
     360             : static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p)
     361             : {
     362           0 :         return container_of(p, struct eppoll_entry, wait);
     363             : }
     364             : 
     365             : /* Get the "struct epitem" from a wait queue pointer */
     366             : static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p)
     367             : {
     368           0 :         return container_of(p, struct eppoll_entry, wait)->base;
     369             : }
     370             : 
     371             : /**
     372             :  * ep_events_available - Checks if ready events might be available.
     373             :  *
     374             :  * @ep: Pointer to the eventpoll context.
     375             :  *
     376             :  * Return: a value different than %zero if ready events are available,
     377             :  *          or %zero otherwise.
     378             :  */
     379             : static inline int ep_events_available(struct eventpoll *ep)
     380             : {
     381           0 :         return !list_empty_careful(&ep->rdllist) ||
     382           0 :                 READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
     383             : }
     384             : 
     385             : #ifdef CONFIG_NET_RX_BUSY_POLL
     386             : static bool ep_busy_loop_end(void *p, unsigned long start_time)
     387             : {
     388             :         struct eventpoll *ep = p;
     389             : 
     390             :         return ep_events_available(ep) || busy_loop_timeout(start_time);
     391             : }
     392             : 
     393             : /*
     394             :  * Busy poll if globally on and supporting sockets found && no events,
     395             :  * busy loop will return if need_resched or ep_events_available.
     396             :  *
     397             :  * we must do our busy polling with irqs enabled
     398             :  */
     399             : static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
     400             : {
     401             :         unsigned int napi_id = READ_ONCE(ep->napi_id);
     402             : 
     403             :         if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) {
     404             :                 napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false,
     405             :                                BUSY_POLL_BUDGET);
     406             :                 if (ep_events_available(ep))
     407             :                         return true;
     408             :                 /*
     409             :                  * Busy poll timed out.  Drop NAPI ID for now, we can add
     410             :                  * it back in when we have moved a socket with a valid NAPI
     411             :                  * ID onto the ready list.
     412             :                  */
     413             :                 ep->napi_id = 0;
     414             :                 return false;
     415             :         }
     416             :         return false;
     417             : }
     418             : 
     419             : /*
     420             :  * Set epoll busy poll NAPI ID from sk.
     421             :  */
     422             : static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
     423             : {
     424             :         struct eventpoll *ep;
     425             :         unsigned int napi_id;
     426             :         struct socket *sock;
     427             :         struct sock *sk;
     428             : 
     429             :         if (!net_busy_loop_on())
     430             :                 return;
     431             : 
     432             :         sock = sock_from_file(epi->ffd.file);
     433             :         if (!sock)
     434             :                 return;
     435             : 
     436             :         sk = sock->sk;
     437             :         if (!sk)
     438             :                 return;
     439             : 
     440             :         napi_id = READ_ONCE(sk->sk_napi_id);
     441             :         ep = epi->ep;
     442             : 
     443             :         /* Non-NAPI IDs can be rejected
     444             :          *      or
     445             :          * Nothing to do if we already have this ID
     446             :          */
     447             :         if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id)
     448             :                 return;
     449             : 
     450             :         /* record NAPI ID for use in next busy poll */
     451             :         ep->napi_id = napi_id;
     452             : }
     453             : 
     454             : #else
     455             : 
     456             : static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock)
     457             : {
     458             :         return false;
     459             : }
     460             : 
     461             : static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
     462             : {
     463             : }
     464             : 
     465             : #endif /* CONFIG_NET_RX_BUSY_POLL */
     466             : 
     467             : /*
     468             :  * As described in commit 0ccf831cb lockdep: annotate epoll
     469             :  * the use of wait queues used by epoll is done in a very controlled
     470             :  * manner. Wake ups can nest inside each other, but are never done
     471             :  * with the same locking. For example:
     472             :  *
     473             :  *   dfd = socket(...);
     474             :  *   efd1 = epoll_create();
     475             :  *   efd2 = epoll_create();
     476             :  *   epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...);
     477             :  *   epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...);
     478             :  *
     479             :  * When a packet arrives to the device underneath "dfd", the net code will
     480             :  * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a
     481             :  * callback wakeup entry on that queue, and the wake_up() performed by the
     482             :  * "dfd" net code will end up in ep_poll_callback(). At this point epoll
     483             :  * (efd1) notices that it may have some event ready, so it needs to wake up
     484             :  * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake()
     485             :  * that ends up in another wake_up(), after having checked about the
     486             :  * recursion constraints. That are, no more than EP_MAX_POLLWAKE_NESTS, to
     487             :  * avoid stack blasting.
     488             :  *
     489             :  * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle
     490             :  * this special case of epoll.
     491             :  */
     492             : #ifdef CONFIG_DEBUG_LOCK_ALLOC
     493             : 
     494             : static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
     495             : {
     496             :         struct eventpoll *ep_src;
     497             :         unsigned long flags;
     498             :         u8 nests = 0;
     499             : 
     500             :         /*
     501             :          * To set the subclass or nesting level for spin_lock_irqsave_nested()
     502             :          * it might be natural to create a per-cpu nest count. However, since
     503             :          * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can
     504             :          * schedule() in the -rt kernel, the per-cpu variable are no longer
     505             :          * protected. Thus, we are introducing a per eventpoll nest field.
     506             :          * If we are not being call from ep_poll_callback(), epi is NULL and
     507             :          * we are at the first level of nesting, 0. Otherwise, we are being
     508             :          * called from ep_poll_callback() and if a previous wakeup source is
     509             :          * not an epoll file itself, we are at depth 1 since the wakeup source
     510             :          * is depth 0. If the wakeup source is a previous epoll file in the
     511             :          * wakeup chain then we use its nests value and record ours as
     512             :          * nests + 1. The previous epoll file nests value is stable since its
     513             :          * already holding its own poll_wait.lock.
     514             :          */
     515             :         if (epi) {
     516             :                 if ((is_file_epoll(epi->ffd.file))) {
     517             :                         ep_src = epi->ffd.file->private_data;
     518             :                         nests = ep_src->nests;
     519             :                 } else {
     520             :                         nests = 1;
     521             :                 }
     522             :         }
     523             :         spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests);
     524             :         ep->nests = nests + 1;
     525             :         wake_up_locked_poll(&ep->poll_wait, EPOLLIN);
     526             :         ep->nests = 0;
     527             :         spin_unlock_irqrestore(&ep->poll_wait.lock, flags);
     528             : }
     529             : 
     530             : #else
     531             : 
     532             : static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
     533             : {
     534           0 :         wake_up_poll(&ep->poll_wait, EPOLLIN);
     535             : }
     536             : 
     537             : #endif
     538             : 
     539             : static void ep_remove_wait_queue(struct eppoll_entry *pwq)
     540             : {
     541             :         wait_queue_head_t *whead;
     542             : 
     543             :         rcu_read_lock();
     544             :         /*
     545             :          * If it is cleared by POLLFREE, it should be rcu-safe.
     546             :          * If we read NULL we need a barrier paired with
     547             :          * smp_store_release() in ep_poll_callback(), otherwise
     548             :          * we rely on whead->lock.
     549             :          */
     550           0 :         whead = smp_load_acquire(&pwq->whead);
     551           0 :         if (whead)
     552           0 :                 remove_wait_queue(whead, &pwq->wait);
     553             :         rcu_read_unlock();
     554             : }
     555             : 
     556             : /*
     557             :  * This function unregisters poll callbacks from the associated file
     558             :  * descriptor.  Must be called with "mtx" held (or "epmutex" if called from
     559             :  * ep_free).
     560             :  */
     561           0 : static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
     562             : {
     563           0 :         struct eppoll_entry **p = &epi->pwqlist;
     564             :         struct eppoll_entry *pwq;
     565             : 
     566           0 :         while ((pwq = *p) != NULL) {
     567           0 :                 *p = pwq->next;
     568           0 :                 ep_remove_wait_queue(pwq);
     569           0 :                 kmem_cache_free(pwq_cache, pwq);
     570             :         }
     571           0 : }
     572             : 
     573             : /* call only when ep->mtx is held */
     574             : static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi)
     575             : {
     576           0 :         return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx));
     577             : }
     578             : 
     579             : /* call only when ep->mtx is held */
     580             : static inline void ep_pm_stay_awake(struct epitem *epi)
     581             : {
     582           0 :         struct wakeup_source *ws = ep_wakeup_source(epi);
     583             : 
     584           0 :         if (ws)
     585           0 :                 __pm_stay_awake(ws);
     586             : }
     587             : 
     588             : static inline bool ep_has_wakeup_source(struct epitem *epi)
     589             : {
     590           0 :         return rcu_access_pointer(epi->ws) ? true : false;
     591             : }
     592             : 
     593             : /* call when ep->mtx cannot be held (ep_poll_callback) */
     594             : static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
     595             : {
     596             :         struct wakeup_source *ws;
     597             : 
     598             :         rcu_read_lock();
     599           0 :         ws = rcu_dereference(epi->ws);
     600           0 :         if (ws)
     601           0 :                 __pm_stay_awake(ws);
     602             :         rcu_read_unlock();
     603             : }
     604             : 
     605             : 
     606             : /*
     607             :  * ep->mutex needs to be held because we could be hit by
     608             :  * eventpoll_release_file() and epoll_ctl().
     609             :  */
     610           0 : static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist)
     611             : {
     612             :         /*
     613             :          * Steal the ready list, and re-init the original one to the
     614             :          * empty list. Also, set ep->ovflist to NULL so that events
     615             :          * happening while looping w/out locks, are not lost. We cannot
     616             :          * have the poll callback to queue directly on ep->rdllist,
     617             :          * because we want the "sproc" callback to be able to do it
     618             :          * in a lockless way.
     619             :          */
     620             :         lockdep_assert_irqs_enabled();
     621           0 :         write_lock_irq(&ep->lock);
     622           0 :         list_splice_init(&ep->rdllist, txlist);
     623           0 :         WRITE_ONCE(ep->ovflist, NULL);
     624           0 :         write_unlock_irq(&ep->lock);
     625           0 : }
     626             : 
     627           0 : static void ep_done_scan(struct eventpoll *ep,
     628             :                          struct list_head *txlist)
     629             : {
     630             :         struct epitem *epi, *nepi;
     631             : 
     632           0 :         write_lock_irq(&ep->lock);
     633             :         /*
     634             :          * During the time we spent inside the "sproc" callback, some
     635             :          * other events might have been queued by the poll callback.
     636             :          * We re-insert them inside the main ready-list here.
     637             :          */
     638           0 :         for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL;
     639           0 :              nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
     640             :                 /*
     641             :                  * We need to check if the item is already in the list.
     642             :                  * During the "sproc" callback execution time, items are
     643             :                  * queued into ->ovflist but the "txlist" might already
     644             :                  * contain them, and the list_splice() below takes care of them.
     645             :                  */
     646           0 :                 if (!ep_is_linked(epi)) {
     647             :                         /*
     648             :                          * ->ovflist is LIFO, so we have to reverse it in order
     649             :                          * to keep in FIFO.
     650             :                          */
     651           0 :                         list_add(&epi->rdllink, &ep->rdllist);
     652             :                         ep_pm_stay_awake(epi);
     653             :                 }
     654             :         }
     655             :         /*
     656             :          * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
     657             :          * releasing the lock, events will be queued in the normal way inside
     658             :          * ep->rdllist.
     659             :          */
     660           0 :         WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);
     661             : 
     662             :         /*
     663             :          * Quickly re-inject items left on "txlist".
     664             :          */
     665           0 :         list_splice(txlist, &ep->rdllist);
     666           0 :         __pm_relax(ep->ws);
     667             : 
     668           0 :         if (!list_empty(&ep->rdllist)) {
     669           0 :                 if (waitqueue_active(&ep->wq))
     670           0 :                         wake_up(&ep->wq);
     671             :         }
     672             : 
     673           0 :         write_unlock_irq(&ep->lock);
     674           0 : }
     675             : 
     676           0 : static void epi_rcu_free(struct rcu_head *head)
     677             : {
     678           0 :         struct epitem *epi = container_of(head, struct epitem, rcu);
     679           0 :         kmem_cache_free(epi_cache, epi);
     680           0 : }
     681             : 
     682             : /*
     683             :  * Removes a "struct epitem" from the eventpoll RB tree and deallocates
     684             :  * all the associated resources. Must be called with "mtx" held.
     685             :  */
     686           0 : static int ep_remove(struct eventpoll *ep, struct epitem *epi)
     687             : {
     688           0 :         struct file *file = epi->ffd.file;
     689             :         struct epitems_head *to_free;
     690             :         struct hlist_head *head;
     691             : 
     692             :         lockdep_assert_irqs_enabled();
     693             : 
     694             :         /*
     695             :          * Removes poll wait queue hooks.
     696             :          */
     697           0 :         ep_unregister_pollwait(ep, epi);
     698             : 
     699             :         /* Remove the current item from the list of epoll hooks */
     700           0 :         spin_lock(&file->f_lock);
     701           0 :         to_free = NULL;
     702           0 :         head = file->f_ep;
     703           0 :         if (head->first == &epi->fllink && !epi->fllink.next) {
     704           0 :                 file->f_ep = NULL;
     705           0 :                 if (!is_file_epoll(file)) {
     706             :                         struct epitems_head *v;
     707           0 :                         v = container_of(head, struct epitems_head, epitems);
     708           0 :                         if (!smp_load_acquire(&v->next))
     709           0 :                                 to_free = v;
     710             :                 }
     711             :         }
     712           0 :         hlist_del_rcu(&epi->fllink);
     713           0 :         spin_unlock(&file->f_lock);
     714           0 :         free_ephead(to_free);
     715             : 
     716           0 :         rb_erase_cached(&epi->rbn, &ep->rbr);
     717             : 
     718           0 :         write_lock_irq(&ep->lock);
     719           0 :         if (ep_is_linked(epi))
     720           0 :                 list_del_init(&epi->rdllink);
     721           0 :         write_unlock_irq(&ep->lock);
     722             : 
     723           0 :         wakeup_source_unregister(ep_wakeup_source(epi));
     724             :         /*
     725             :          * At this point it is safe to free the eventpoll item. Use the union
     726             :          * field epi->rcu, since we are trying to minimize the size of
     727             :          * 'struct epitem'. The 'rbn' field is no longer in use. Protected by
     728             :          * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
     729             :          * use of the rbn field.
     730             :          */
     731           0 :         call_rcu(&epi->rcu, epi_rcu_free);
     732             : 
     733           0 :         percpu_counter_dec(&ep->user->epoll_watches);
     734             : 
     735           0 :         return 0;
     736             : }
     737             : 
     738           0 : static void ep_free(struct eventpoll *ep)
     739             : {
     740             :         struct rb_node *rbp;
     741             :         struct epitem *epi;
     742             : 
     743             :         /* We need to release all tasks waiting for these file */
     744           0 :         if (waitqueue_active(&ep->poll_wait))
     745           0 :                 ep_poll_safewake(ep, NULL);
     746             : 
     747             :         /*
     748             :          * We need to lock this because we could be hit by
     749             :          * eventpoll_release_file() while we're freeing the "struct eventpoll".
     750             :          * We do not need to hold "ep->mtx" here because the epoll file
     751             :          * is on the way to be removed and no one has references to it
     752             :          * anymore. The only hit might come from eventpoll_release_file() but
     753             :          * holding "epmutex" is sufficient here.
     754             :          */
     755           0 :         mutex_lock(&epmutex);
     756             : 
     757             :         /*
     758             :          * Walks through the whole tree by unregistering poll callbacks.
     759             :          */
     760           0 :         for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
     761           0 :                 epi = rb_entry(rbp, struct epitem, rbn);
     762             : 
     763           0 :                 ep_unregister_pollwait(ep, epi);
     764           0 :                 cond_resched();
     765             :         }
     766             : 
     767             :         /*
     768             :          * Walks through the whole tree by freeing each "struct epitem". At this
     769             :          * point we are sure no poll callbacks will be lingering around, and also by
     770             :          * holding "epmutex" we can be sure that no file cleanup code will hit
     771             :          * us during this operation. So we can avoid the lock on "ep->lock".
     772             :          * We do not need to lock ep->mtx, either, we only do it to prevent
     773             :          * a lockdep warning.
     774             :          */
     775           0 :         mutex_lock(&ep->mtx);
     776           0 :         while ((rbp = rb_first_cached(&ep->rbr)) != NULL) {
     777           0 :                 epi = rb_entry(rbp, struct epitem, rbn);
     778           0 :                 ep_remove(ep, epi);
     779           0 :                 cond_resched();
     780             :         }
     781           0 :         mutex_unlock(&ep->mtx);
     782             : 
     783           0 :         mutex_unlock(&epmutex);
     784           0 :         mutex_destroy(&ep->mtx);
     785           0 :         free_uid(ep->user);
     786           0 :         wakeup_source_unregister(ep->ws);
     787           0 :         kfree(ep);
     788           0 : }
     789             : 
     790           0 : static int ep_eventpoll_release(struct inode *inode, struct file *file)
     791             : {
     792           0 :         struct eventpoll *ep = file->private_data;
     793             : 
     794           0 :         if (ep)
     795           0 :                 ep_free(ep);
     796             : 
     797           0 :         return 0;
     798             : }
     799             : 
     800             : static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth);
     801             : 
     802           0 : static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int depth)
     803             : {
     804           0 :         struct eventpoll *ep = file->private_data;
     805           0 :         LIST_HEAD(txlist);
     806             :         struct epitem *epi, *tmp;
     807             :         poll_table pt;
     808           0 :         __poll_t res = 0;
     809             : 
     810           0 :         init_poll_funcptr(&pt, NULL);
     811             : 
     812             :         /* Insert inside our poll wait queue */
     813           0 :         poll_wait(file, &ep->poll_wait, wait);
     814             : 
     815             :         /*
     816             :          * Proceed to find out if wanted events are really available inside
     817             :          * the ready list.
     818             :          */
     819           0 :         mutex_lock_nested(&ep->mtx, depth);
     820           0 :         ep_start_scan(ep, &txlist);
     821           0 :         list_for_each_entry_safe(epi, tmp, &txlist, rdllink) {
     822           0 :                 if (ep_item_poll(epi, &pt, depth + 1)) {
     823             :                         res = EPOLLIN | EPOLLRDNORM;
     824             :                         break;
     825             :                 } else {
     826             :                         /*
     827             :                          * Item has been dropped into the ready list by the poll
     828             :                          * callback, but it's not actually ready, as far as
     829             :                          * caller requested events goes. We can remove it here.
     830             :                          */
     831           0 :                         __pm_relax(ep_wakeup_source(epi));
     832           0 :                         list_del_init(&epi->rdllink);
     833             :                 }
     834             :         }
     835           0 :         ep_done_scan(ep, &txlist);
     836           0 :         mutex_unlock(&ep->mtx);
     837           0 :         return res;
     838             : }
     839             : 
     840             : /*
     841             :  * Differs from ep_eventpoll_poll() in that internal callers already have
     842             :  * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
     843             :  * is correctly annotated.
     844             :  */
     845           0 : static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
     846             :                                  int depth)
     847             : {
     848           0 :         struct file *file = epi->ffd.file;
     849             :         __poll_t res;
     850             : 
     851           0 :         pt->_key = epi->event.events;
     852           0 :         if (!is_file_epoll(file))
     853             :                 res = vfs_poll(file, pt);
     854             :         else
     855           0 :                 res = __ep_eventpoll_poll(file, pt, depth);
     856           0 :         return res & epi->event.events;
     857             : }
     858             : 
     859           0 : static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait)
     860             : {
     861           0 :         return __ep_eventpoll_poll(file, wait, 0);
     862             : }
     863             : 
     864             : #ifdef CONFIG_PROC_FS
     865           0 : static void ep_show_fdinfo(struct seq_file *m, struct file *f)
     866             : {
     867           0 :         struct eventpoll *ep = f->private_data;
     868             :         struct rb_node *rbp;
     869             : 
     870           0 :         mutex_lock(&ep->mtx);
     871           0 :         for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
     872           0 :                 struct epitem *epi = rb_entry(rbp, struct epitem, rbn);
     873           0 :                 struct inode *inode = file_inode(epi->ffd.file);
     874             : 
     875           0 :                 seq_printf(m, "tfd: %8d events: %8x data: %16llx "
     876             :                            " pos:%lli ino:%lx sdev:%x\n",
     877             :                            epi->ffd.fd, epi->event.events,
     878           0 :                            (long long)epi->event.data,
     879           0 :                            (long long)epi->ffd.file->f_pos,
     880           0 :                            inode->i_ino, inode->i_sb->s_dev);
     881           0 :                 if (seq_has_overflowed(m))
     882             :                         break;
     883             :         }
     884           0 :         mutex_unlock(&ep->mtx);
     885           0 : }
     886             : #endif
     887             : 
     888             : /* File callbacks that implement the eventpoll file behaviour */
     889             : static const struct file_operations eventpoll_fops = {
     890             : #ifdef CONFIG_PROC_FS
     891             :         .show_fdinfo    = ep_show_fdinfo,
     892             : #endif
     893             :         .release        = ep_eventpoll_release,
     894             :         .poll           = ep_eventpoll_poll,
     895             :         .llseek         = noop_llseek,
     896             : };
     897             : 
     898             : /*
     899             :  * This is called from eventpoll_release() to unlink files from the eventpoll
     900             :  * interface. We need to have this facility to cleanup correctly files that are
     901             :  * closed without being removed from the eventpoll interface.
     902             :  */
     903           0 : void eventpoll_release_file(struct file *file)
     904             : {
     905             :         struct eventpoll *ep;
     906             :         struct epitem *epi;
     907             :         struct hlist_node *next;
     908             : 
     909             :         /*
     910             :          * We don't want to get "file->f_lock" because it is not
     911             :          * necessary. It is not necessary because we're in the "struct file"
     912             :          * cleanup path, and this means that no one is using this file anymore.
     913             :          * So, for example, epoll_ctl() cannot hit here since if we reach this
     914             :          * point, the file counter already went to zero and fget() would fail.
     915             :          * The only hit might come from ep_free() but by holding the mutex
     916             :          * will correctly serialize the operation. We do need to acquire
     917             :          * "ep->mtx" after "epmutex" because ep_remove() requires it when called
     918             :          * from anywhere but ep_free().
     919             :          *
     920             :          * Besides, ep_remove() acquires the lock, so we can't hold it here.
     921             :          */
     922           0 :         mutex_lock(&epmutex);
     923           0 :         if (unlikely(!file->f_ep)) {
     924           0 :                 mutex_unlock(&epmutex);
     925           0 :                 return;
     926             :         }
     927           0 :         hlist_for_each_entry_safe(epi, next, file->f_ep, fllink) {
     928           0 :                 ep = epi->ep;
     929           0 :                 mutex_lock_nested(&ep->mtx, 0);
     930           0 :                 ep_remove(ep, epi);
     931           0 :                 mutex_unlock(&ep->mtx);
     932             :         }
     933           0 :         mutex_unlock(&epmutex);
     934             : }
     935             : 
     936           0 : static int ep_alloc(struct eventpoll **pep)
     937             : {
     938             :         int error;
     939             :         struct user_struct *user;
     940             :         struct eventpoll *ep;
     941             : 
     942           0 :         user = get_current_user();
     943           0 :         error = -ENOMEM;
     944           0 :         ep = kzalloc(sizeof(*ep), GFP_KERNEL);
     945           0 :         if (unlikely(!ep))
     946             :                 goto free_uid;
     947             : 
     948           0 :         mutex_init(&ep->mtx);
     949             :         rwlock_init(&ep->lock);
     950           0 :         init_waitqueue_head(&ep->wq);
     951           0 :         init_waitqueue_head(&ep->poll_wait);
     952           0 :         INIT_LIST_HEAD(&ep->rdllist);
     953           0 :         ep->rbr = RB_ROOT_CACHED;
     954           0 :         ep->ovflist = EP_UNACTIVE_PTR;
     955           0 :         ep->user = user;
     956             : 
     957           0 :         *pep = ep;
     958             : 
     959           0 :         return 0;
     960             : 
     961             : free_uid:
     962           0 :         free_uid(user);
     963           0 :         return error;
     964             : }
     965             : 
     966             : /*
     967             :  * Search the file inside the eventpoll tree. The RB tree operations
     968             :  * are protected by the "mtx" mutex, and ep_find() must be called with
     969             :  * "mtx" held.
     970             :  */
     971             : static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
     972             : {
     973             :         int kcmp;
     974             :         struct rb_node *rbp;
     975           0 :         struct epitem *epi, *epir = NULL;
     976             :         struct epoll_filefd ffd;
     977             : 
     978           0 :         ep_set_ffd(&ffd, file, fd);
     979           0 :         for (rbp = ep->rbr.rb_root.rb_node; rbp; ) {
     980           0 :                 epi = rb_entry(rbp, struct epitem, rbn);
     981           0 :                 kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
     982           0 :                 if (kcmp > 0)
     983           0 :                         rbp = rbp->rb_right;
     984           0 :                 else if (kcmp < 0)
     985           0 :                         rbp = rbp->rb_left;
     986             :                 else {
     987             :                         epir = epi;
     988             :                         break;
     989             :                 }
     990             :         }
     991             : 
     992             :         return epir;
     993             : }
     994             : 
     995             : #ifdef CONFIG_KCMP
     996           0 : static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff)
     997             : {
     998             :         struct rb_node *rbp;
     999             :         struct epitem *epi;
    1000             : 
    1001           0 :         for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
    1002           0 :                 epi = rb_entry(rbp, struct epitem, rbn);
    1003           0 :                 if (epi->ffd.fd == tfd) {
    1004           0 :                         if (toff == 0)
    1005             :                                 return epi;
    1006             :                         else
    1007           0 :                                 toff--;
    1008             :                 }
    1009           0 :                 cond_resched();
    1010             :         }
    1011             : 
    1012             :         return NULL;
    1013             : }
    1014             : 
    1015           0 : struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
    1016             :                                      unsigned long toff)
    1017             : {
    1018             :         struct file *file_raw;
    1019             :         struct eventpoll *ep;
    1020             :         struct epitem *epi;
    1021             : 
    1022           0 :         if (!is_file_epoll(file))
    1023             :                 return ERR_PTR(-EINVAL);
    1024             : 
    1025           0 :         ep = file->private_data;
    1026             : 
    1027           0 :         mutex_lock(&ep->mtx);
    1028           0 :         epi = ep_find_tfd(ep, tfd, toff);
    1029           0 :         if (epi)
    1030           0 :                 file_raw = epi->ffd.file;
    1031             :         else
    1032             :                 file_raw = ERR_PTR(-ENOENT);
    1033           0 :         mutex_unlock(&ep->mtx);
    1034             : 
    1035           0 :         return file_raw;
    1036             : }
    1037             : #endif /* CONFIG_KCMP */
    1038             : 
    1039             : /*
    1040             :  * Adds a new entry to the tail of the list in a lockless way, i.e.
    1041             :  * multiple CPUs are allowed to call this function concurrently.
    1042             :  *
    1043             :  * Beware: it is necessary to prevent any other modifications of the
    1044             :  *         existing list until all changes are completed, in other words
    1045             :  *         concurrent list_add_tail_lockless() calls should be protected
    1046             :  *         with a read lock, where write lock acts as a barrier which
    1047             :  *         makes sure all list_add_tail_lockless() calls are fully
    1048             :  *         completed.
    1049             :  *
    1050             :  *        Also an element can be locklessly added to the list only in one
    1051             :  *        direction i.e. either to the tail or to the head, otherwise
    1052             :  *        concurrent access will corrupt the list.
    1053             :  *
    1054             :  * Return: %false if element has been already added to the list, %true
    1055             :  * otherwise.
    1056             :  */
    1057             : static inline bool list_add_tail_lockless(struct list_head *new,
    1058             :                                           struct list_head *head)
    1059             : {
    1060             :         struct list_head *prev;
    1061             : 
    1062             :         /*
    1063             :          * This is simple 'new->next = head' operation, but cmpxchg()
    1064             :          * is used in order to detect that same element has been just
    1065             :          * added to the list from another CPU: the winner observes
    1066             :          * new->next == new.
    1067             :          */
    1068           0 :         if (cmpxchg(&new->next, new, head) != new)
    1069             :                 return false;
    1070             : 
    1071             :         /*
    1072             :          * Initially ->next of a new element must be updated with the head
    1073             :          * (we are inserting to the tail) and only then pointers are atomically
    1074             :          * exchanged.  XCHG guarantees memory ordering, thus ->next should be
    1075             :          * updated before pointers are actually swapped and pointers are
    1076             :          * swapped before prev->next is updated.
    1077             :          */
    1078             : 
    1079           0 :         prev = xchg(&head->prev, new);
    1080             : 
    1081             :         /*
    1082             :          * It is safe to modify prev->next and new->prev, because a new element
    1083             :          * is added only to the tail and new->next is updated before XCHG.
    1084             :          */
    1085             : 
    1086           0 :         prev->next = new;
    1087           0 :         new->prev = prev;
    1088             : 
    1089             :         return true;
    1090             : }
    1091             : 
    1092             : /*
    1093             :  * Chains a new epi entry to the tail of the ep->ovflist in a lockless way,
    1094             :  * i.e. multiple CPUs are allowed to call this function concurrently.
    1095             :  *
    1096             :  * Return: %false if epi element has been already chained, %true otherwise.
    1097             :  */
    1098             : static inline bool chain_epi_lockless(struct epitem *epi)
    1099             : {
    1100           0 :         struct eventpoll *ep = epi->ep;
    1101             : 
    1102             :         /* Fast preliminary check */
    1103           0 :         if (epi->next != EP_UNACTIVE_PTR)
    1104             :                 return false;
    1105             : 
    1106             :         /* Check that the same epi has not been just chained from another CPU */
    1107           0 :         if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
    1108             :                 return false;
    1109             : 
    1110             :         /* Atomically exchange tail */
    1111           0 :         epi->next = xchg(&ep->ovflist, epi);
    1112             : 
    1113             :         return true;
    1114             : }
    1115             : 
    1116             : /*
    1117             :  * This is the callback that is passed to the wait queue wakeup
    1118             :  * mechanism. It is called by the stored file descriptors when they
    1119             :  * have events to report.
    1120             :  *
    1121             :  * This callback takes a read lock in order not to contend with concurrent
    1122             :  * events from another file descriptor, thus all modifications to ->rdllist
    1123             :  * or ->ovflist are lockless.  Read lock is paired with the write lock from
    1124             :  * ep_scan_ready_list(), which stops all list modifications and guarantees
    1125             :  * that lists state is seen correctly.
    1126             :  *
    1127             :  * Another thing worth to mention is that ep_poll_callback() can be called
    1128             :  * concurrently for the same @epi from different CPUs if poll table was inited
    1129             :  * with several wait queues entries.  Plural wakeup from different CPUs of a
    1130             :  * single wait queue is serialized by wq.lock, but the case when multiple wait
    1131             :  * queues are used should be detected accordingly.  This is detected using
    1132             :  * cmpxchg() operation.
    1133             :  */
    1134           0 : static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
    1135             : {
    1136           0 :         int pwake = 0;
    1137           0 :         struct epitem *epi = ep_item_from_wait(wait);
    1138           0 :         struct eventpoll *ep = epi->ep;
    1139           0 :         __poll_t pollflags = key_to_poll(key);
    1140             :         unsigned long flags;
    1141           0 :         int ewake = 0;
    1142             : 
    1143           0 :         read_lock_irqsave(&ep->lock, flags);
    1144             : 
    1145             :         ep_set_busy_poll_napi_id(epi);
    1146             : 
    1147             :         /*
    1148             :          * If the event mask does not contain any poll(2) event, we consider the
    1149             :          * descriptor to be disabled. This condition is likely the effect of the
    1150             :          * EPOLLONESHOT bit that disables the descriptor when an event is received,
    1151             :          * until the next EPOLL_CTL_MOD will be issued.
    1152             :          */
    1153           0 :         if (!(epi->event.events & ~EP_PRIVATE_BITS))
    1154             :                 goto out_unlock;
    1155             : 
    1156             :         /*
    1157             :          * Check the events coming with the callback. At this stage, not
    1158             :          * every device reports the events in the "key" parameter of the
    1159             :          * callback. We need to be able to handle both cases here, hence the
    1160             :          * test for "key" != NULL before the event match test.
    1161             :          */
    1162           0 :         if (pollflags && !(pollflags & epi->event.events))
    1163             :                 goto out_unlock;
    1164             : 
    1165             :         /*
    1166             :          * If we are transferring events to userspace, we can hold no locks
    1167             :          * (because we're accessing user memory, and because of linux f_op->poll()
    1168             :          * semantics). All the events that happen during that period of time are
    1169             :          * chained in ep->ovflist and requeued later on.
    1170             :          */
    1171           0 :         if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
    1172           0 :                 if (chain_epi_lockless(epi))
    1173             :                         ep_pm_stay_awake_rcu(epi);
    1174           0 :         } else if (!ep_is_linked(epi)) {
    1175             :                 /* In the usual case, add event to ready list. */
    1176           0 :                 if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist))
    1177             :                         ep_pm_stay_awake_rcu(epi);
    1178             :         }
    1179             : 
    1180             :         /*
    1181             :          * Wake up ( if active ) both the eventpoll wait list and the ->poll()
    1182             :          * wait list.
    1183             :          */
    1184           0 :         if (waitqueue_active(&ep->wq)) {
    1185           0 :                 if ((epi->event.events & EPOLLEXCLUSIVE) &&
    1186           0 :                                         !(pollflags & POLLFREE)) {
    1187           0 :                         switch (pollflags & EPOLLINOUT_BITS) {
    1188             :                         case EPOLLIN:
    1189           0 :                                 if (epi->event.events & EPOLLIN)
    1190           0 :                                         ewake = 1;
    1191             :                                 break;
    1192             :                         case EPOLLOUT:
    1193           0 :                                 if (epi->event.events & EPOLLOUT)
    1194           0 :                                         ewake = 1;
    1195             :                                 break;
    1196             :                         case 0:
    1197           0 :                                 ewake = 1;
    1198           0 :                                 break;
    1199             :                         }
    1200             :                 }
    1201           0 :                 wake_up(&ep->wq);
    1202             :         }
    1203           0 :         if (waitqueue_active(&ep->poll_wait))
    1204           0 :                 pwake++;
    1205             : 
    1206             : out_unlock:
    1207           0 :         read_unlock_irqrestore(&ep->lock, flags);
    1208             : 
    1209             :         /* We have to call this outside the lock */
    1210           0 :         if (pwake)
    1211           0 :                 ep_poll_safewake(ep, epi);
    1212             : 
    1213           0 :         if (!(epi->event.events & EPOLLEXCLUSIVE))
    1214           0 :                 ewake = 1;
    1215             : 
    1216           0 :         if (pollflags & POLLFREE) {
    1217             :                 /*
    1218             :                  * If we race with ep_remove_wait_queue() it can miss
    1219             :                  * ->whead = NULL and do another remove_wait_queue() after
    1220             :                  * us, so we can't use __remove_wait_queue().
    1221             :                  */
    1222           0 :                 list_del_init(&wait->entry);
    1223             :                 /*
    1224             :                  * ->whead != NULL protects us from the race with ep_free()
    1225             :                  * or ep_remove(), ep_remove_wait_queue() takes whead->lock
    1226             :                  * held by the caller. Once we nullify it, nothing protects
    1227             :                  * ep/epi or even wait.
    1228             :                  */
    1229           0 :                 smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
    1230             :         }
    1231             : 
    1232           0 :         return ewake;
    1233             : }
    1234             : 
    1235             : /*
    1236             :  * This is the callback that is used to add our wait queue to the
    1237             :  * target file wakeup lists.
    1238             :  */
    1239           0 : static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
    1240             :                                  poll_table *pt)
    1241             : {
    1242           0 :         struct ep_pqueue *epq = container_of(pt, struct ep_pqueue, pt);
    1243           0 :         struct epitem *epi = epq->epi;
    1244             :         struct eppoll_entry *pwq;
    1245             : 
    1246           0 :         if (unlikely(!epi))     // an earlier allocation has failed
    1247             :                 return;
    1248             : 
    1249           0 :         pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL);
    1250           0 :         if (unlikely(!pwq)) {
    1251           0 :                 epq->epi = NULL;
    1252           0 :                 return;
    1253             :         }
    1254             : 
    1255           0 :         init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
    1256           0 :         pwq->whead = whead;
    1257           0 :         pwq->base = epi;
    1258           0 :         if (epi->event.events & EPOLLEXCLUSIVE)
    1259           0 :                 add_wait_queue_exclusive(whead, &pwq->wait);
    1260             :         else
    1261           0 :                 add_wait_queue(whead, &pwq->wait);
    1262           0 :         pwq->next = epi->pwqlist;
    1263           0 :         epi->pwqlist = pwq;
    1264             : }
    1265             : 
    1266           0 : static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
    1267             : {
    1268             :         int kcmp;
    1269           0 :         struct rb_node **p = &ep->rbr.rb_root.rb_node, *parent = NULL;
    1270             :         struct epitem *epic;
    1271           0 :         bool leftmost = true;
    1272             : 
    1273           0 :         while (*p) {
    1274           0 :                 parent = *p;
    1275           0 :                 epic = rb_entry(parent, struct epitem, rbn);
    1276           0 :                 kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd);
    1277           0 :                 if (kcmp > 0) {
    1278           0 :                         p = &parent->rb_right;
    1279           0 :                         leftmost = false;
    1280             :                 } else
    1281           0 :                         p = &parent->rb_left;
    1282             :         }
    1283           0 :         rb_link_node(&epi->rbn, parent, p);
    1284           0 :         rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost);
    1285           0 : }
    1286             : 
    1287             : 
    1288             : 
    1289             : #define PATH_ARR_SIZE 5
    1290             : /*
    1291             :  * These are the number paths of length 1 to 5, that we are allowing to emanate
    1292             :  * from a single file of interest. For example, we allow 1000 paths of length
    1293             :  * 1, to emanate from each file of interest. This essentially represents the
    1294             :  * potential wakeup paths, which need to be limited in order to avoid massive
    1295             :  * uncontrolled wakeup storms. The common use case should be a single ep which
    1296             :  * is connected to n file sources. In this case each file source has 1 path
    1297             :  * of length 1. Thus, the numbers below should be more than sufficient. These
    1298             :  * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify
    1299             :  * and delete can't add additional paths. Protected by the epmutex.
    1300             :  */
    1301             : static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
    1302             : static int path_count[PATH_ARR_SIZE];
    1303             : 
    1304             : static int path_count_inc(int nests)
    1305             : {
    1306             :         /* Allow an arbitrary number of depth 1 paths */
    1307           0 :         if (nests == 0)
    1308             :                 return 0;
    1309             : 
    1310           0 :         if (++path_count[nests] > path_limits[nests])
    1311             :                 return -1;
    1312             :         return 0;
    1313             : }
    1314             : 
    1315             : static void path_count_init(void)
    1316             : {
    1317             :         int i;
    1318             : 
    1319           0 :         for (i = 0; i < PATH_ARR_SIZE; i++)
    1320           0 :                 path_count[i] = 0;
    1321             : }
    1322             : 
    1323           0 : static int reverse_path_check_proc(struct hlist_head *refs, int depth)
    1324             : {
    1325           0 :         int error = 0;
    1326             :         struct epitem *epi;
    1327             : 
    1328           0 :         if (depth > EP_MAX_NESTS) /* too deep nesting */
    1329             :                 return -1;
    1330             : 
    1331             :         /* CTL_DEL can remove links here, but that can't increase our count */
    1332           0 :         hlist_for_each_entry_rcu(epi, refs, fllink) {
    1333           0 :                 struct hlist_head *refs = &epi->ep->refs;
    1334           0 :                 if (hlist_empty(refs))
    1335             :                         error = path_count_inc(depth);
    1336             :                 else
    1337           0 :                         error = reverse_path_check_proc(refs, depth + 1);
    1338           0 :                 if (error != 0)
    1339             :                         break;
    1340             :         }
    1341             :         return error;
    1342             : }
    1343             : 
    1344             : /**
    1345             :  * reverse_path_check - The tfile_check_list is list of epitem_head, which have
    1346             :  *                      links that are proposed to be newly added. We need to
    1347             :  *                      make sure that those added links don't add too many
    1348             :  *                      paths such that we will spend all our time waking up
    1349             :  *                      eventpoll objects.
    1350             :  *
    1351             :  * Return: %zero if the proposed links don't create too many paths,
    1352             :  *          %-1 otherwise.
    1353             :  */
    1354           0 : static int reverse_path_check(void)
    1355             : {
    1356             :         struct epitems_head *p;
    1357             : 
    1358           0 :         for (p = tfile_check_list; p != EP_UNACTIVE_PTR; p = p->next) {
    1359             :                 int error;
    1360           0 :                 path_count_init();
    1361             :                 rcu_read_lock();
    1362           0 :                 error = reverse_path_check_proc(&p->epitems, 0);
    1363             :                 rcu_read_unlock();
    1364           0 :                 if (error)
    1365             :                         return error;
    1366             :         }
    1367             :         return 0;
    1368             : }
    1369             : 
    1370           0 : static int ep_create_wakeup_source(struct epitem *epi)
    1371             : {
    1372             :         struct name_snapshot n;
    1373             :         struct wakeup_source *ws;
    1374             : 
    1375           0 :         if (!epi->ep->ws) {
    1376           0 :                 epi->ep->ws = wakeup_source_register(NULL, "eventpoll");
    1377           0 :                 if (!epi->ep->ws)
    1378             :                         return -ENOMEM;
    1379             :         }
    1380             : 
    1381           0 :         take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry);
    1382           0 :         ws = wakeup_source_register(NULL, n.name.name);
    1383           0 :         release_dentry_name_snapshot(&n);
    1384             : 
    1385           0 :         if (!ws)
    1386             :                 return -ENOMEM;
    1387           0 :         rcu_assign_pointer(epi->ws, ws);
    1388             : 
    1389           0 :         return 0;
    1390             : }
    1391             : 
    1392             : /* rare code path, only used when EPOLL_CTL_MOD removes a wakeup source */
    1393           0 : static noinline void ep_destroy_wakeup_source(struct epitem *epi)
    1394             : {
    1395           0 :         struct wakeup_source *ws = ep_wakeup_source(epi);
    1396             : 
    1397           0 :         RCU_INIT_POINTER(epi->ws, NULL);
    1398             : 
    1399             :         /*
    1400             :          * wait for ep_pm_stay_awake_rcu to finish, synchronize_rcu is
    1401             :          * used internally by wakeup_source_remove, too (called by
    1402             :          * wakeup_source_unregister), so we cannot use call_rcu
    1403             :          */
    1404           0 :         synchronize_rcu();
    1405           0 :         wakeup_source_unregister(ws);
    1406           0 : }
    1407             : 
    1408           0 : static int attach_epitem(struct file *file, struct epitem *epi)
    1409             : {
    1410           0 :         struct epitems_head *to_free = NULL;
    1411           0 :         struct hlist_head *head = NULL;
    1412           0 :         struct eventpoll *ep = NULL;
    1413             : 
    1414           0 :         if (is_file_epoll(file))
    1415           0 :                 ep = file->private_data;
    1416             : 
    1417           0 :         if (ep) {
    1418           0 :                 head = &ep->refs;
    1419           0 :         } else if (!READ_ONCE(file->f_ep)) {
    1420             : allocate:
    1421           0 :                 to_free = kmem_cache_zalloc(ephead_cache, GFP_KERNEL);
    1422           0 :                 if (!to_free)
    1423             :                         return -ENOMEM;
    1424           0 :                 head = &to_free->epitems;
    1425             :         }
    1426           0 :         spin_lock(&file->f_lock);
    1427           0 :         if (!file->f_ep) {
    1428           0 :                 if (unlikely(!head)) {
    1429           0 :                         spin_unlock(&file->f_lock);
    1430             :                         goto allocate;
    1431             :                 }
    1432           0 :                 file->f_ep = head;
    1433           0 :                 to_free = NULL;
    1434             :         }
    1435           0 :         hlist_add_head_rcu(&epi->fllink, file->f_ep);
    1436           0 :         spin_unlock(&file->f_lock);
    1437             :         free_ephead(to_free);
    1438             :         return 0;
    1439             : }
    1440             : 
    1441             : /*
    1442             :  * Must be called with "mtx" held.
    1443             :  */
    1444           0 : static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
    1445             :                      struct file *tfile, int fd, int full_check)
    1446             : {
    1447           0 :         int error, pwake = 0;
    1448             :         __poll_t revents;
    1449             :         struct epitem *epi;
    1450             :         struct ep_pqueue epq;
    1451           0 :         struct eventpoll *tep = NULL;
    1452             : 
    1453           0 :         if (is_file_epoll(tfile))
    1454           0 :                 tep = tfile->private_data;
    1455             : 
    1456             :         lockdep_assert_irqs_enabled();
    1457             : 
    1458           0 :         if (unlikely(percpu_counter_compare(&ep->user->epoll_watches,
    1459             :                                             max_user_watches) >= 0))
    1460             :                 return -ENOSPC;
    1461           0 :         percpu_counter_inc(&ep->user->epoll_watches);
    1462             : 
    1463           0 :         if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) {
    1464           0 :                 percpu_counter_dec(&ep->user->epoll_watches);
    1465           0 :                 return -ENOMEM;
    1466             :         }
    1467             : 
    1468             :         /* Item initialization follow here ... */
    1469           0 :         INIT_LIST_HEAD(&epi->rdllink);
    1470           0 :         epi->ep = ep;
    1471           0 :         ep_set_ffd(&epi->ffd, tfile, fd);
    1472           0 :         epi->event = *event;
    1473           0 :         epi->next = EP_UNACTIVE_PTR;
    1474             : 
    1475           0 :         if (tep)
    1476           0 :                 mutex_lock_nested(&tep->mtx, 1);
    1477             :         /* Add the current item to the list of active epoll hook for this file */
    1478           0 :         if (unlikely(attach_epitem(tfile, epi) < 0)) {
    1479           0 :                 if (tep)
    1480           0 :                         mutex_unlock(&tep->mtx);
    1481           0 :                 kmem_cache_free(epi_cache, epi);
    1482           0 :                 percpu_counter_dec(&ep->user->epoll_watches);
    1483           0 :                 return -ENOMEM;
    1484             :         }
    1485             : 
    1486           0 :         if (full_check && !tep)
    1487           0 :                 list_file(tfile);
    1488             : 
    1489             :         /*
    1490             :          * Add the current item to the RB tree. All RB tree operations are
    1491             :          * protected by "mtx", and ep_insert() is called with "mtx" held.
    1492             :          */
    1493           0 :         ep_rbtree_insert(ep, epi);
    1494           0 :         if (tep)
    1495           0 :                 mutex_unlock(&tep->mtx);
    1496             : 
    1497             :         /* now check if we've created too many backpaths */
    1498           0 :         if (unlikely(full_check && reverse_path_check())) {
    1499           0 :                 ep_remove(ep, epi);
    1500           0 :                 return -EINVAL;
    1501             :         }
    1502             : 
    1503           0 :         if (epi->event.events & EPOLLWAKEUP) {
    1504           0 :                 error = ep_create_wakeup_source(epi);
    1505           0 :                 if (error) {
    1506           0 :                         ep_remove(ep, epi);
    1507           0 :                         return error;
    1508             :                 }
    1509             :         }
    1510             : 
    1511             :         /* Initialize the poll table using the queue callback */
    1512           0 :         epq.epi = epi;
    1513           0 :         init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
    1514             : 
    1515             :         /*
    1516             :          * Attach the item to the poll hooks and get current event bits.
    1517             :          * We can safely use the file* here because its usage count has
    1518             :          * been increased by the caller of this function. Note that after
    1519             :          * this operation completes, the poll callback can start hitting
    1520             :          * the new item.
    1521             :          */
    1522           0 :         revents = ep_item_poll(epi, &epq.pt, 1);
    1523             : 
    1524             :         /*
    1525             :          * We have to check if something went wrong during the poll wait queue
    1526             :          * install process. Namely an allocation for a wait queue failed due
    1527             :          * high memory pressure.
    1528             :          */
    1529           0 :         if (unlikely(!epq.epi)) {
    1530           0 :                 ep_remove(ep, epi);
    1531           0 :                 return -ENOMEM;
    1532             :         }
    1533             : 
    1534             :         /* We have to drop the new item inside our item list to keep track of it */
    1535           0 :         write_lock_irq(&ep->lock);
    1536             : 
    1537             :         /* record NAPI ID of new item if present */
    1538             :         ep_set_busy_poll_napi_id(epi);
    1539             : 
    1540             :         /* If the file is already "ready" we drop it inside the ready list */
    1541           0 :         if (revents && !ep_is_linked(epi)) {
    1542           0 :                 list_add_tail(&epi->rdllink, &ep->rdllist);
    1543           0 :                 ep_pm_stay_awake(epi);
    1544             : 
    1545             :                 /* Notify waiting tasks that events are available */
    1546           0 :                 if (waitqueue_active(&ep->wq))
    1547           0 :                         wake_up(&ep->wq);
    1548           0 :                 if (waitqueue_active(&ep->poll_wait))
    1549           0 :                         pwake++;
    1550             :         }
    1551             : 
    1552           0 :         write_unlock_irq(&ep->lock);
    1553             : 
    1554             :         /* We have to call this outside the lock */
    1555           0 :         if (pwake)
    1556           0 :                 ep_poll_safewake(ep, NULL);
    1557             : 
    1558             :         return 0;
    1559             : }
    1560             : 
    1561             : /*
    1562             :  * Modify the interest event mask by dropping an event if the new mask
    1563             :  * has a match in the current file status. Must be called with "mtx" held.
    1564             :  */
    1565           0 : static int ep_modify(struct eventpoll *ep, struct epitem *epi,
    1566             :                      const struct epoll_event *event)
    1567             : {
    1568           0 :         int pwake = 0;
    1569             :         poll_table pt;
    1570             : 
    1571             :         lockdep_assert_irqs_enabled();
    1572             : 
    1573           0 :         init_poll_funcptr(&pt, NULL);
    1574             : 
    1575             :         /*
    1576             :          * Set the new event interest mask before calling f_op->poll();
    1577             :          * otherwise we might miss an event that happens between the
    1578             :          * f_op->poll() call and the new event set registering.
    1579             :          */
    1580           0 :         epi->event.events = event->events; /* need barrier below */
    1581           0 :         epi->event.data = event->data; /* protected by mtx */
    1582           0 :         if (epi->event.events & EPOLLWAKEUP) {
    1583           0 :                 if (!ep_has_wakeup_source(epi))
    1584           0 :                         ep_create_wakeup_source(epi);
    1585           0 :         } else if (ep_has_wakeup_source(epi)) {
    1586           0 :                 ep_destroy_wakeup_source(epi);
    1587             :         }
    1588             : 
    1589             :         /*
    1590             :          * The following barrier has two effects:
    1591             :          *
    1592             :          * 1) Flush epi changes above to other CPUs.  This ensures
    1593             :          *    we do not miss events from ep_poll_callback if an
    1594             :          *    event occurs immediately after we call f_op->poll().
    1595             :          *    We need this because we did not take ep->lock while
    1596             :          *    changing epi above (but ep_poll_callback does take
    1597             :          *    ep->lock).
    1598             :          *
    1599             :          * 2) We also need to ensure we do not miss _past_ events
    1600             :          *    when calling f_op->poll().  This barrier also
    1601             :          *    pairs with the barrier in wq_has_sleeper (see
    1602             :          *    comments for wq_has_sleeper).
    1603             :          *
    1604             :          * This barrier will now guarantee ep_poll_callback or f_op->poll
    1605             :          * (or both) will notice the readiness of an item.
    1606             :          */
    1607           0 :         smp_mb();
    1608             : 
    1609             :         /*
    1610             :          * Get current event bits. We can safely use the file* here because
    1611             :          * its usage count has been increased by the caller of this function.
    1612             :          * If the item is "hot" and it is not registered inside the ready
    1613             :          * list, push it inside.
    1614             :          */
    1615           0 :         if (ep_item_poll(epi, &pt, 1)) {
    1616           0 :                 write_lock_irq(&ep->lock);
    1617           0 :                 if (!ep_is_linked(epi)) {
    1618           0 :                         list_add_tail(&epi->rdllink, &ep->rdllist);
    1619           0 :                         ep_pm_stay_awake(epi);
    1620             : 
    1621             :                         /* Notify waiting tasks that events are available */
    1622           0 :                         if (waitqueue_active(&ep->wq))
    1623           0 :                                 wake_up(&ep->wq);
    1624           0 :                         if (waitqueue_active(&ep->poll_wait))
    1625           0 :                                 pwake++;
    1626             :                 }
    1627           0 :                 write_unlock_irq(&ep->lock);
    1628             :         }
    1629             : 
    1630             :         /* We have to call this outside the lock */
    1631           0 :         if (pwake)
    1632           0 :                 ep_poll_safewake(ep, NULL);
    1633             : 
    1634           0 :         return 0;
    1635             : }
    1636             : 
    1637           0 : static int ep_send_events(struct eventpoll *ep,
    1638             :                           struct epoll_event __user *events, int maxevents)
    1639             : {
    1640             :         struct epitem *epi, *tmp;
    1641           0 :         LIST_HEAD(txlist);
    1642             :         poll_table pt;
    1643           0 :         int res = 0;
    1644             : 
    1645             :         /*
    1646             :          * Always short-circuit for fatal signals to allow threads to make a
    1647             :          * timely exit without the chance of finding more events available and
    1648             :          * fetching repeatedly.
    1649             :          */
    1650           0 :         if (fatal_signal_pending(current))
    1651             :                 return -EINTR;
    1652             : 
    1653           0 :         init_poll_funcptr(&pt, NULL);
    1654             : 
    1655           0 :         mutex_lock(&ep->mtx);
    1656           0 :         ep_start_scan(ep, &txlist);
    1657             : 
    1658             :         /*
    1659             :          * We can loop without lock because we are passed a task private list.
    1660             :          * Items cannot vanish during the loop we are holding ep->mtx.
    1661             :          */
    1662           0 :         list_for_each_entry_safe(epi, tmp, &txlist, rdllink) {
    1663             :                 struct wakeup_source *ws;
    1664             :                 __poll_t revents;
    1665             : 
    1666           0 :                 if (res >= maxevents)
    1667             :                         break;
    1668             : 
    1669             :                 /*
    1670             :                  * Activate ep->ws before deactivating epi->ws to prevent
    1671             :                  * triggering auto-suspend here (in case we reactive epi->ws
    1672             :                  * below).
    1673             :                  *
    1674             :                  * This could be rearranged to delay the deactivation of epi->ws
    1675             :                  * instead, but then epi->ws would temporarily be out of sync
    1676             :                  * with ep_is_linked().
    1677             :                  */
    1678           0 :                 ws = ep_wakeup_source(epi);
    1679           0 :                 if (ws) {
    1680           0 :                         if (ws->active)
    1681           0 :                                 __pm_stay_awake(ep->ws);
    1682           0 :                         __pm_relax(ws);
    1683             :                 }
    1684             : 
    1685           0 :                 list_del_init(&epi->rdllink);
    1686             : 
    1687             :                 /*
    1688             :                  * If the event mask intersect the caller-requested one,
    1689             :                  * deliver the event to userspace. Again, we are holding ep->mtx,
    1690             :                  * so no operations coming from userspace can change the item.
    1691             :                  */
    1692           0 :                 revents = ep_item_poll(epi, &pt, 1);
    1693           0 :                 if (!revents)
    1694           0 :                         continue;
    1695             : 
    1696           0 :                 events = epoll_put_uevent(revents, epi->event.data, events);
    1697           0 :                 if (!events) {
    1698           0 :                         list_add(&epi->rdllink, &txlist);
    1699           0 :                         ep_pm_stay_awake(epi);
    1700           0 :                         if (!res)
    1701           0 :                                 res = -EFAULT;
    1702             :                         break;
    1703             :                 }
    1704           0 :                 res++;
    1705           0 :                 if (epi->event.events & EPOLLONESHOT)
    1706           0 :                         epi->event.events &= EP_PRIVATE_BITS;
    1707           0 :                 else if (!(epi->event.events & EPOLLET)) {
    1708             :                         /*
    1709             :                          * If this file has been added with Level
    1710             :                          * Trigger mode, we need to insert back inside
    1711             :                          * the ready list, so that the next call to
    1712             :                          * epoll_wait() will check again the events
    1713             :                          * availability. At this point, no one can insert
    1714             :                          * into ep->rdllist besides us. The epoll_ctl()
    1715             :                          * callers are locked out by
    1716             :                          * ep_scan_ready_list() holding "mtx" and the
    1717             :                          * poll callback will queue them in ep->ovflist.
    1718             :                          */
    1719           0 :                         list_add_tail(&epi->rdllink, &ep->rdllist);
    1720             :                         ep_pm_stay_awake(epi);
    1721             :                 }
    1722             :         }
    1723           0 :         ep_done_scan(ep, &txlist);
    1724           0 :         mutex_unlock(&ep->mtx);
    1725             : 
    1726           0 :         return res;
    1727             : }
    1728             : 
    1729           0 : static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms)
    1730             : {
    1731             :         struct timespec64 now;
    1732             : 
    1733           0 :         if (ms < 0)
    1734             :                 return NULL;
    1735             : 
    1736           0 :         if (!ms) {
    1737           0 :                 to->tv_sec = 0;
    1738           0 :                 to->tv_nsec = 0;
    1739           0 :                 return to;
    1740             :         }
    1741             : 
    1742           0 :         to->tv_sec = ms / MSEC_PER_SEC;
    1743           0 :         to->tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC);
    1744             : 
    1745           0 :         ktime_get_ts64(&now);
    1746           0 :         *to = timespec64_add_safe(now, *to);
    1747           0 :         return to;
    1748             : }
    1749             : 
    1750             : /**
    1751             :  * ep_poll - Retrieves ready events, and delivers them to the caller-supplied
    1752             :  *           event buffer.
    1753             :  *
    1754             :  * @ep: Pointer to the eventpoll context.
    1755             :  * @events: Pointer to the userspace buffer where the ready events should be
    1756             :  *          stored.
    1757             :  * @maxevents: Size (in terms of number of events) of the caller event buffer.
    1758             :  * @timeout: Maximum timeout for the ready events fetch operation, in
    1759             :  *           timespec. If the timeout is zero, the function will not block,
    1760             :  *           while if the @timeout ptr is NULL, the function will block
    1761             :  *           until at least one event has been retrieved (or an error
    1762             :  *           occurred).
    1763             :  *
    1764             :  * Return: the number of ready events which have been fetched, or an
    1765             :  *          error code, in case of error.
    1766             :  */
    1767           0 : static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
    1768             :                    int maxevents, struct timespec64 *timeout)
    1769             : {
    1770           0 :         int res, eavail, timed_out = 0;
    1771           0 :         u64 slack = 0;
    1772             :         wait_queue_entry_t wait;
    1773           0 :         ktime_t expires, *to = NULL;
    1774             : 
    1775             :         lockdep_assert_irqs_enabled();
    1776             : 
    1777           0 :         if (timeout && (timeout->tv_sec | timeout->tv_nsec)) {
    1778           0 :                 slack = select_estimate_accuracy(timeout);
    1779           0 :                 to = &expires;
    1780           0 :                 *to = timespec64_to_ktime(*timeout);
    1781           0 :         } else if (timeout) {
    1782             :                 /*
    1783             :                  * Avoid the unnecessary trip to the wait queue loop, if the
    1784             :                  * caller specified a non blocking operation.
    1785             :                  */
    1786           0 :                 timed_out = 1;
    1787             :         }
    1788             : 
    1789             :         /*
    1790             :          * This call is racy: We may or may not see events that are being added
    1791             :          * to the ready list under the lock (e.g., in IRQ callbacks). For cases
    1792             :          * with a non-zero timeout, this thread will check the ready list under
    1793             :          * lock and will add to the wait queue.  For cases with a zero
    1794             :          * timeout, the user by definition should not care and will have to
    1795             :          * recheck again.
    1796             :          */
    1797             :         eavail = ep_events_available(ep);
    1798             : 
    1799             :         while (1) {
    1800           0 :                 if (eavail) {
    1801             :                         /*
    1802             :                          * Try to transfer events to user space. In case we get
    1803             :                          * 0 events and there's still timeout left over, we go
    1804             :                          * trying again in search of more luck.
    1805             :                          */
    1806           0 :                         res = ep_send_events(ep, events, maxevents);
    1807           0 :                         if (res)
    1808             :                                 return res;
    1809             :                 }
    1810             : 
    1811           0 :                 if (timed_out)
    1812             :                         return 0;
    1813             : 
    1814           0 :                 eavail = ep_busy_loop(ep, timed_out);
    1815             :                 if (eavail)
    1816             :                         continue;
    1817             : 
    1818           0 :                 if (signal_pending(current))
    1819             :                         return -EINTR;
    1820             : 
    1821             :                 /*
    1822             :                  * Internally init_wait() uses autoremove_wake_function(),
    1823             :                  * thus wait entry is removed from the wait queue on each
    1824             :                  * wakeup. Why it is important? In case of several waiters
    1825             :                  * each new wakeup will hit the next waiter, giving it the
    1826             :                  * chance to harvest new event. Otherwise wakeup can be
    1827             :                  * lost. This is also good performance-wise, because on
    1828             :                  * normal wakeup path no need to call __remove_wait_queue()
    1829             :                  * explicitly, thus ep->lock is not taken, which halts the
    1830             :                  * event delivery.
    1831             :                  */
    1832           0 :                 init_wait(&wait);
    1833             : 
    1834           0 :                 write_lock_irq(&ep->lock);
    1835             :                 /*
    1836             :                  * Barrierless variant, waitqueue_active() is called under
    1837             :                  * the same lock on wakeup ep_poll_callback() side, so it
    1838             :                  * is safe to avoid an explicit barrier.
    1839             :                  */
    1840           0 :                 __set_current_state(TASK_INTERRUPTIBLE);
    1841             : 
    1842             :                 /*
    1843             :                  * Do the final check under the lock. ep_scan_ready_list()
    1844             :                  * plays with two lists (->rdllist and ->ovflist) and there
    1845             :                  * is always a race when both lists are empty for short
    1846             :                  * period of time although events are pending, so lock is
    1847             :                  * important.
    1848             :                  */
    1849           0 :                 eavail = ep_events_available(ep);
    1850           0 :                 if (!eavail)
    1851           0 :                         __add_wait_queue_exclusive(&ep->wq, &wait);
    1852             : 
    1853           0 :                 write_unlock_irq(&ep->lock);
    1854             : 
    1855           0 :                 if (!eavail)
    1856           0 :                         timed_out = !schedule_hrtimeout_range(to, slack,
    1857             :                                                               HRTIMER_MODE_ABS);
    1858           0 :                 __set_current_state(TASK_RUNNING);
    1859             : 
    1860             :                 /*
    1861             :                  * We were woken up, thus go and try to harvest some events.
    1862             :                  * If timed out and still on the wait queue, recheck eavail
    1863             :                  * carefully under lock, below.
    1864             :                  */
    1865           0 :                 eavail = 1;
    1866             : 
    1867           0 :                 if (!list_empty_careful(&wait.entry)) {
    1868           0 :                         write_lock_irq(&ep->lock);
    1869             :                         /*
    1870             :                          * If the thread timed out and is not on the wait queue,
    1871             :                          * it means that the thread was woken up after its
    1872             :                          * timeout expired before it could reacquire the lock.
    1873             :                          * Thus, when wait.entry is empty, it needs to harvest
    1874             :                          * events.
    1875             :                          */
    1876           0 :                         if (timed_out)
    1877           0 :                                 eavail = list_empty(&wait.entry);
    1878           0 :                         __remove_wait_queue(&ep->wq, &wait);
    1879           0 :                         write_unlock_irq(&ep->lock);
    1880             :                 }
    1881             :         }
    1882             : }
    1883             : 
    1884             : /**
    1885             :  * ep_loop_check_proc - verify that adding an epoll file inside another
    1886             :  *                      epoll structure does not violate the constraints, in
    1887             :  *                      terms of closed loops, or too deep chains (which can
    1888             :  *                      result in excessive stack usage).
    1889             :  *
    1890             :  * @ep: the &struct eventpoll to be currently checked.
    1891             :  * @depth: Current depth of the path being checked.
    1892             :  *
    1893             :  * Return: %zero if adding the epoll @file inside current epoll
    1894             :  *          structure @ep does not violate the constraints, or %-1 otherwise.
    1895             :  */
    1896           0 : static int ep_loop_check_proc(struct eventpoll *ep, int depth)
    1897             : {
    1898           0 :         int error = 0;
    1899             :         struct rb_node *rbp;
    1900             :         struct epitem *epi;
    1901             : 
    1902           0 :         mutex_lock_nested(&ep->mtx, depth + 1);
    1903           0 :         ep->gen = loop_check_gen;
    1904           0 :         for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
    1905           0 :                 epi = rb_entry(rbp, struct epitem, rbn);
    1906           0 :                 if (unlikely(is_file_epoll(epi->ffd.file))) {
    1907             :                         struct eventpoll *ep_tovisit;
    1908           0 :                         ep_tovisit = epi->ffd.file->private_data;
    1909           0 :                         if (ep_tovisit->gen == loop_check_gen)
    1910           0 :                                 continue;
    1911           0 :                         if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS)
    1912             :                                 error = -1;
    1913             :                         else
    1914           0 :                                 error = ep_loop_check_proc(ep_tovisit, depth + 1);
    1915           0 :                         if (error != 0)
    1916             :                                 break;
    1917             :                 } else {
    1918             :                         /*
    1919             :                          * If we've reached a file that is not associated with
    1920             :                          * an ep, then we need to check if the newly added
    1921             :                          * links are going to add too many wakeup paths. We do
    1922             :                          * this by adding it to the tfile_check_list, if it's
    1923             :                          * not already there, and calling reverse_path_check()
    1924             :                          * during ep_insert().
    1925             :                          */
    1926           0 :                         list_file(epi->ffd.file);
    1927             :                 }
    1928             :         }
    1929           0 :         mutex_unlock(&ep->mtx);
    1930             : 
    1931           0 :         return error;
    1932             : }
    1933             : 
    1934             : /**
    1935             :  * ep_loop_check - Performs a check to verify that adding an epoll file (@to)
    1936             :  *                 into another epoll file (represented by @ep) does not create
    1937             :  *                 closed loops or too deep chains.
    1938             :  *
    1939             :  * @ep: Pointer to the epoll we are inserting into.
    1940             :  * @to: Pointer to the epoll to be inserted.
    1941             :  *
    1942             :  * Return: %zero if adding the epoll @to inside the epoll @from
    1943             :  * does not violate the constraints, or %-1 otherwise.
    1944             :  */
    1945             : static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to)
    1946             : {
    1947           0 :         inserting_into = ep;
    1948           0 :         return ep_loop_check_proc(to, 0);
    1949             : }
    1950             : 
    1951             : static void clear_tfile_check_list(void)
    1952             : {
    1953             :         rcu_read_lock();
    1954           0 :         while (tfile_check_list != EP_UNACTIVE_PTR) {
    1955           0 :                 struct epitems_head *head = tfile_check_list;
    1956           0 :                 tfile_check_list = head->next;
    1957           0 :                 unlist_file(head);
    1958             :         }
    1959             :         rcu_read_unlock();
    1960             : }
    1961             : 
    1962             : /*
    1963             :  * Open an eventpoll file descriptor.
    1964             :  */
    1965           0 : static int do_epoll_create(int flags)
    1966             : {
    1967             :         int error, fd;
    1968           0 :         struct eventpoll *ep = NULL;
    1969             :         struct file *file;
    1970             : 
    1971             :         /* Check the EPOLL_* constant for consistency.  */
    1972             :         BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
    1973             : 
    1974           0 :         if (flags & ~EPOLL_CLOEXEC)
    1975             :                 return -EINVAL;
    1976             :         /*
    1977             :          * Create the internal data structure ("struct eventpoll").
    1978             :          */
    1979           0 :         error = ep_alloc(&ep);
    1980           0 :         if (error < 0)
    1981             :                 return error;
    1982             :         /*
    1983             :          * Creates all the items needed to setup an eventpoll file. That is,
    1984             :          * a file structure and a free file descriptor.
    1985             :          */
    1986           0 :         fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
    1987           0 :         if (fd < 0) {
    1988             :                 error = fd;
    1989             :                 goto out_free_ep;
    1990             :         }
    1991           0 :         file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
    1992             :                                  O_RDWR | (flags & O_CLOEXEC));
    1993           0 :         if (IS_ERR(file)) {
    1994           0 :                 error = PTR_ERR(file);
    1995             :                 goto out_free_fd;
    1996             :         }
    1997           0 :         ep->file = file;
    1998           0 :         fd_install(fd, file);
    1999           0 :         return fd;
    2000             : 
    2001             : out_free_fd:
    2002           0 :         put_unused_fd(fd);
    2003             : out_free_ep:
    2004           0 :         ep_free(ep);
    2005           0 :         return error;
    2006             : }
    2007             : 
    2008           0 : SYSCALL_DEFINE1(epoll_create1, int, flags)
    2009             : {
    2010           0 :         return do_epoll_create(flags);
    2011             : }
    2012             : 
    2013           0 : SYSCALL_DEFINE1(epoll_create, int, size)
    2014             : {
    2015           0 :         if (size <= 0)
    2016             :                 return -EINVAL;
    2017             : 
    2018           0 :         return do_epoll_create(0);
    2019             : }
    2020             : 
    2021             : static inline int epoll_mutex_lock(struct mutex *mutex, int depth,
    2022             :                                    bool nonblock)
    2023             : {
    2024           0 :         if (!nonblock) {
    2025           0 :                 mutex_lock_nested(mutex, depth);
    2026             :                 return 0;
    2027             :         }
    2028           0 :         if (mutex_trylock(mutex))
    2029             :                 return 0;
    2030             :         return -EAGAIN;
    2031             : }
    2032             : 
    2033           0 : int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
    2034             :                  bool nonblock)
    2035             : {
    2036             :         int error;
    2037           0 :         int full_check = 0;
    2038             :         struct fd f, tf;
    2039             :         struct eventpoll *ep;
    2040             :         struct epitem *epi;
    2041           0 :         struct eventpoll *tep = NULL;
    2042             : 
    2043           0 :         error = -EBADF;
    2044           0 :         f = fdget(epfd);
    2045           0 :         if (!f.file)
    2046             :                 goto error_return;
    2047             : 
    2048             :         /* Get the "struct file *" for the target file */
    2049           0 :         tf = fdget(fd);
    2050           0 :         if (!tf.file)
    2051             :                 goto error_fput;
    2052             : 
    2053             :         /* The target file descriptor must support poll */
    2054           0 :         error = -EPERM;
    2055           0 :         if (!file_can_poll(tf.file))
    2056             :                 goto error_tgt_fput;
    2057             : 
    2058             :         /* Check if EPOLLWAKEUP is allowed */
    2059           0 :         if (ep_op_has_event(op))
    2060           0 :                 ep_take_care_of_epollwakeup(epds);
    2061             : 
    2062             :         /*
    2063             :          * We have to check that the file structure underneath the file descriptor
    2064             :          * the user passed to us _is_ an eventpoll file. And also we do not permit
    2065             :          * adding an epoll file descriptor inside itself.
    2066             :          */
    2067           0 :         error = -EINVAL;
    2068           0 :         if (f.file == tf.file || !is_file_epoll(f.file))
    2069             :                 goto error_tgt_fput;
    2070             : 
    2071             :         /*
    2072             :          * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
    2073             :          * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
    2074             :          * Also, we do not currently supported nested exclusive wakeups.
    2075             :          */
    2076           0 :         if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
    2077           0 :                 if (op == EPOLL_CTL_MOD)
    2078             :                         goto error_tgt_fput;
    2079           0 :                 if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
    2080           0 :                                 (epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
    2081             :                         goto error_tgt_fput;
    2082             :         }
    2083             : 
    2084             :         /*
    2085             :          * At this point it is safe to assume that the "private_data" contains
    2086             :          * our own data structure.
    2087             :          */
    2088           0 :         ep = f.file->private_data;
    2089             : 
    2090             :         /*
    2091             :          * When we insert an epoll file descriptor inside another epoll file
    2092             :          * descriptor, there is the chance of creating closed loops, which are
    2093             :          * better be handled here, than in more critical paths. While we are
    2094             :          * checking for loops we also determine the list of files reachable
    2095             :          * and hang them on the tfile_check_list, so we can check that we
    2096             :          * haven't created too many possible wakeup paths.
    2097             :          *
    2098             :          * We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when
    2099             :          * the epoll file descriptor is attaching directly to a wakeup source,
    2100             :          * unless the epoll file descriptor is nested. The purpose of taking the
    2101             :          * 'epmutex' on add is to prevent complex toplogies such as loops and
    2102             :          * deep wakeup paths from forming in parallel through multiple
    2103             :          * EPOLL_CTL_ADD operations.
    2104             :          */
    2105           0 :         error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
    2106           0 :         if (error)
    2107             :                 goto error_tgt_fput;
    2108           0 :         if (op == EPOLL_CTL_ADD) {
    2109           0 :                 if (READ_ONCE(f.file->f_ep) || ep->gen == loop_check_gen ||
    2110           0 :                     is_file_epoll(tf.file)) {
    2111           0 :                         mutex_unlock(&ep->mtx);
    2112           0 :                         error = epoll_mutex_lock(&epmutex, 0, nonblock);
    2113           0 :                         if (error)
    2114             :                                 goto error_tgt_fput;
    2115           0 :                         loop_check_gen++;
    2116           0 :                         full_check = 1;
    2117           0 :                         if (is_file_epoll(tf.file)) {
    2118           0 :                                 tep = tf.file->private_data;
    2119           0 :                                 error = -ELOOP;
    2120           0 :                                 if (ep_loop_check(ep, tep) != 0)
    2121             :                                         goto error_tgt_fput;
    2122             :                         }
    2123           0 :                         error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
    2124           0 :                         if (error)
    2125             :                                 goto error_tgt_fput;
    2126             :                 }
    2127             :         }
    2128             : 
    2129             :         /*
    2130             :          * Try to lookup the file inside our RB tree. Since we grabbed "mtx"
    2131             :          * above, we can be sure to be able to use the item looked up by
    2132             :          * ep_find() till we release the mutex.
    2133             :          */
    2134           0 :         epi = ep_find(ep, tf.file, fd);
    2135             : 
    2136           0 :         error = -EINVAL;
    2137           0 :         switch (op) {
    2138             :         case EPOLL_CTL_ADD:
    2139           0 :                 if (!epi) {
    2140           0 :                         epds->events |= EPOLLERR | EPOLLHUP;
    2141           0 :                         error = ep_insert(ep, epds, tf.file, fd, full_check);
    2142             :                 } else
    2143             :                         error = -EEXIST;
    2144             :                 break;
    2145             :         case EPOLL_CTL_DEL:
    2146           0 :                 if (epi)
    2147           0 :                         error = ep_remove(ep, epi);
    2148             :                 else
    2149             :                         error = -ENOENT;
    2150             :                 break;
    2151             :         case EPOLL_CTL_MOD:
    2152           0 :                 if (epi) {
    2153           0 :                         if (!(epi->event.events & EPOLLEXCLUSIVE)) {
    2154           0 :                                 epds->events |= EPOLLERR | EPOLLHUP;
    2155           0 :                                 error = ep_modify(ep, epi, epds);
    2156             :                         }
    2157             :                 } else
    2158             :                         error = -ENOENT;
    2159             :                 break;
    2160             :         }
    2161           0 :         mutex_unlock(&ep->mtx);
    2162             : 
    2163             : error_tgt_fput:
    2164           0 :         if (full_check) {
    2165             :                 clear_tfile_check_list();
    2166           0 :                 loop_check_gen++;
    2167           0 :                 mutex_unlock(&epmutex);
    2168             :         }
    2169             : 
    2170           0 :         fdput(tf);
    2171             : error_fput:
    2172           0 :         fdput(f);
    2173             : error_return:
    2174             : 
    2175           0 :         return error;
    2176             : }
    2177             : 
    2178             : /*
    2179             :  * The following function implements the controller interface for
    2180             :  * the eventpoll file that enables the insertion/removal/change of
    2181             :  * file descriptors inside the interest set.
    2182             :  */
    2183           0 : SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
    2184             :                 struct epoll_event __user *, event)
    2185             : {
    2186             :         struct epoll_event epds;
    2187             : 
    2188           0 :         if (ep_op_has_event(op) &&
    2189           0 :             copy_from_user(&epds, event, sizeof(struct epoll_event)))
    2190             :                 return -EFAULT;
    2191             : 
    2192           0 :         return do_epoll_ctl(epfd, op, fd, &epds, false);
    2193             : }
    2194             : 
    2195             : /*
    2196             :  * Implement the event wait interface for the eventpoll file. It is the kernel
    2197             :  * part of the user space epoll_wait(2).
    2198             :  */
    2199           0 : static int do_epoll_wait(int epfd, struct epoll_event __user *events,
    2200             :                          int maxevents, struct timespec64 *to)
    2201             : {
    2202             :         int error;
    2203             :         struct fd f;
    2204             :         struct eventpoll *ep;
    2205             : 
    2206             :         /* The maximum number of event must be greater than zero */
    2207           0 :         if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
    2208             :                 return -EINVAL;
    2209             : 
    2210             :         /* Verify that the area passed by the user is writeable */
    2211           0 :         if (!access_ok(events, maxevents * sizeof(struct epoll_event)))
    2212             :                 return -EFAULT;
    2213             : 
    2214             :         /* Get the "struct file *" for the eventpoll file */
    2215           0 :         f = fdget(epfd);
    2216           0 :         if (!f.file)
    2217             :                 return -EBADF;
    2218             : 
    2219             :         /*
    2220             :          * We have to check that the file structure underneath the fd
    2221             :          * the user passed to us _is_ an eventpoll file.
    2222             :          */
    2223           0 :         error = -EINVAL;
    2224           0 :         if (!is_file_epoll(f.file))
    2225             :                 goto error_fput;
    2226             : 
    2227             :         /*
    2228             :          * At this point it is safe to assume that the "private_data" contains
    2229             :          * our own data structure.
    2230             :          */
    2231           0 :         ep = f.file->private_data;
    2232             : 
    2233             :         /* Time to fish for events ... */
    2234           0 :         error = ep_poll(ep, events, maxevents, to);
    2235             : 
    2236             : error_fput:
    2237           0 :         fdput(f);
    2238             :         return error;
    2239             : }
    2240             : 
    2241           0 : SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
    2242             :                 int, maxevents, int, timeout)
    2243             : {
    2244             :         struct timespec64 to;
    2245             : 
    2246           0 :         return do_epoll_wait(epfd, events, maxevents,
    2247             :                              ep_timeout_to_timespec(&to, timeout));
    2248             : }
    2249             : 
    2250             : /*
    2251             :  * Implement the event wait interface for the eventpoll file. It is the kernel
    2252             :  * part of the user space epoll_pwait(2).
    2253             :  */
    2254           0 : static int do_epoll_pwait(int epfd, struct epoll_event __user *events,
    2255             :                           int maxevents, struct timespec64 *to,
    2256             :                           const sigset_t __user *sigmask, size_t sigsetsize)
    2257             : {
    2258             :         int error;
    2259             : 
    2260             :         /*
    2261             :          * If the caller wants a certain signal mask to be set during the wait,
    2262             :          * we apply it here.
    2263             :          */
    2264           0 :         error = set_user_sigmask(sigmask, sigsetsize);
    2265           0 :         if (error)
    2266             :                 return error;
    2267             : 
    2268           0 :         error = do_epoll_wait(epfd, events, maxevents, to);
    2269             : 
    2270           0 :         restore_saved_sigmask_unless(error == -EINTR);
    2271             : 
    2272           0 :         return error;
    2273             : }
    2274             : 
    2275           0 : SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
    2276             :                 int, maxevents, int, timeout, const sigset_t __user *, sigmask,
    2277             :                 size_t, sigsetsize)
    2278             : {
    2279             :         struct timespec64 to;
    2280             : 
    2281           0 :         return do_epoll_pwait(epfd, events, maxevents,
    2282             :                               ep_timeout_to_timespec(&to, timeout),
    2283             :                               sigmask, sigsetsize);
    2284             : }
    2285             : 
    2286           0 : SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events,
    2287             :                 int, maxevents, const struct __kernel_timespec __user *, timeout,
    2288             :                 const sigset_t __user *, sigmask, size_t, sigsetsize)
    2289             : {
    2290           0 :         struct timespec64 ts, *to = NULL;
    2291             : 
    2292           0 :         if (timeout) {
    2293           0 :                 if (get_timespec64(&ts, timeout))
    2294             :                         return -EFAULT;
    2295           0 :                 to = &ts;
    2296           0 :                 if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
    2297             :                         return -EINVAL;
    2298             :         }
    2299             : 
    2300           0 :         return do_epoll_pwait(epfd, events, maxevents, to,
    2301             :                               sigmask, sigsetsize);
    2302             : }
    2303             : 
    2304             : #ifdef CONFIG_COMPAT
    2305             : static int do_compat_epoll_pwait(int epfd, struct epoll_event __user *events,
    2306             :                                  int maxevents, struct timespec64 *timeout,
    2307             :                                  const compat_sigset_t __user *sigmask,
    2308             :                                  compat_size_t sigsetsize)
    2309             : {
    2310             :         long err;
    2311             : 
    2312             :         /*
    2313             :          * If the caller wants a certain signal mask to be set during the wait,
    2314             :          * we apply it here.
    2315             :          */
    2316             :         err = set_compat_user_sigmask(sigmask, sigsetsize);
    2317             :         if (err)
    2318             :                 return err;
    2319             : 
    2320             :         err = do_epoll_wait(epfd, events, maxevents, timeout);
    2321             : 
    2322             :         restore_saved_sigmask_unless(err == -EINTR);
    2323             : 
    2324             :         return err;
    2325             : }
    2326             : 
    2327             : COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
    2328             :                        struct epoll_event __user *, events,
    2329             :                        int, maxevents, int, timeout,
    2330             :                        const compat_sigset_t __user *, sigmask,
    2331             :                        compat_size_t, sigsetsize)
    2332             : {
    2333             :         struct timespec64 to;
    2334             : 
    2335             :         return do_compat_epoll_pwait(epfd, events, maxevents,
    2336             :                                      ep_timeout_to_timespec(&to, timeout),
    2337             :                                      sigmask, sigsetsize);
    2338             : }
    2339             : 
    2340             : COMPAT_SYSCALL_DEFINE6(epoll_pwait2, int, epfd,
    2341             :                        struct epoll_event __user *, events,
    2342             :                        int, maxevents,
    2343             :                        const struct __kernel_timespec __user *, timeout,
    2344             :                        const compat_sigset_t __user *, sigmask,
    2345             :                        compat_size_t, sigsetsize)
    2346             : {
    2347             :         struct timespec64 ts, *to = NULL;
    2348             : 
    2349             :         if (timeout) {
    2350             :                 if (get_timespec64(&ts, timeout))
    2351             :                         return -EFAULT;
    2352             :                 to = &ts;
    2353             :                 if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
    2354             :                         return -EINVAL;
    2355             :         }
    2356             : 
    2357             :         return do_compat_epoll_pwait(epfd, events, maxevents, to,
    2358             :                                      sigmask, sigsetsize);
    2359             : }
    2360             : 
    2361             : #endif
    2362             : 
    2363           1 : static int __init eventpoll_init(void)
    2364             : {
    2365             :         struct sysinfo si;
    2366             : 
    2367           1 :         si_meminfo(&si);
    2368             :         /*
    2369             :          * Allows top 4% of lomem to be allocated for epoll watches (per user).
    2370             :          */
    2371           1 :         max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
    2372             :                 EP_ITEM_COST;
    2373           1 :         BUG_ON(max_user_watches < 0);
    2374             : 
    2375             :         /*
    2376             :          * We can have many thousands of epitems, so prevent this from
    2377             :          * using an extra cache line on 64-bit (and smaller) CPUs
    2378             :          */
    2379             :         BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128);
    2380             : 
    2381             :         /* Allocates slab cache used to allocate "struct epitem" items */
    2382           1 :         epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
    2383             :                         0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
    2384             : 
    2385             :         /* Allocates slab cache used to allocate "struct eppoll_entry" */
    2386           1 :         pwq_cache = kmem_cache_create("eventpoll_pwq",
    2387             :                 sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
    2388           1 :         epoll_sysctls_init();
    2389             : 
    2390           1 :         ephead_cache = kmem_cache_create("ep_head",
    2391             :                 sizeof(struct epitems_head), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
    2392             : 
    2393           1 :         return 0;
    2394             : }
    2395             : fs_initcall(eventpoll_init);

Generated by: LCOV version 1.14