Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-only
2 : /*
3 : * fs/eventfd.c
4 : *
5 : * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
6 : *
7 : */
8 :
9 : #include <linux/file.h>
10 : #include <linux/poll.h>
11 : #include <linux/init.h>
12 : #include <linux/fs.h>
13 : #include <linux/sched/signal.h>
14 : #include <linux/kernel.h>
15 : #include <linux/slab.h>
16 : #include <linux/list.h>
17 : #include <linux/spinlock.h>
18 : #include <linux/anon_inodes.h>
19 : #include <linux/syscalls.h>
20 : #include <linux/export.h>
21 : #include <linux/kref.h>
22 : #include <linux/eventfd.h>
23 : #include <linux/proc_fs.h>
24 : #include <linux/seq_file.h>
25 : #include <linux/idr.h>
26 : #include <linux/uio.h>
27 :
28 : static DEFINE_IDA(eventfd_ida);
29 :
30 : struct eventfd_ctx {
31 : struct kref kref;
32 : wait_queue_head_t wqh;
33 : /*
34 : * Every time that a write(2) is performed on an eventfd, the
35 : * value of the __u64 being written is added to "count" and a
36 : * wakeup is performed on "wqh". A read(2) will return the "count"
37 : * value to userspace, and will reset "count" to zero. The kernel
38 : * side eventfd_signal() also, adds to the "count" counter and
39 : * issue a wakeup.
40 : */
41 : __u64 count;
42 : unsigned int flags;
43 : int id;
44 : };
45 :
46 : /**
47 : * eventfd_signal - Adds @n to the eventfd counter.
48 : * @ctx: [in] Pointer to the eventfd context.
49 : * @n: [in] Value of the counter to be added to the eventfd internal counter.
50 : * The value cannot be negative.
51 : *
52 : * This function is supposed to be called by the kernel in paths that do not
53 : * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
54 : * value, and we signal this as overflow condition by returning a EPOLLERR
55 : * to poll(2).
56 : *
57 : * Returns the amount by which the counter was incremented. This will be less
58 : * than @n if the counter has overflowed.
59 : */
60 0 : __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
61 : {
62 : unsigned long flags;
63 :
64 : /*
65 : * Deadlock or stack overflow issues can happen if we recurse here
66 : * through waitqueue wakeup handlers. If the caller users potentially
67 : * nested waitqueues with custom wakeup handlers, then it should
68 : * check eventfd_signal_allowed() before calling this function. If
69 : * it returns false, the eventfd_signal() call should be deferred to a
70 : * safe context.
71 : */
72 0 : if (WARN_ON_ONCE(current->in_eventfd_signal))
73 : return 0;
74 :
75 0 : spin_lock_irqsave(&ctx->wqh.lock, flags);
76 0 : current->in_eventfd_signal = 1;
77 0 : if (ULLONG_MAX - ctx->count < n)
78 0 : n = ULLONG_MAX - ctx->count;
79 0 : ctx->count += n;
80 0 : if (waitqueue_active(&ctx->wqh))
81 0 : wake_up_locked_poll(&ctx->wqh, EPOLLIN);
82 0 : current->in_eventfd_signal = 0;
83 0 : spin_unlock_irqrestore(&ctx->wqh.lock, flags);
84 :
85 0 : return n;
86 : }
87 : EXPORT_SYMBOL_GPL(eventfd_signal);
88 :
89 0 : static void eventfd_free_ctx(struct eventfd_ctx *ctx)
90 : {
91 0 : if (ctx->id >= 0)
92 0 : ida_simple_remove(&eventfd_ida, ctx->id);
93 0 : kfree(ctx);
94 0 : }
95 :
96 0 : static void eventfd_free(struct kref *kref)
97 : {
98 0 : struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);
99 :
100 0 : eventfd_free_ctx(ctx);
101 0 : }
102 :
103 : /**
104 : * eventfd_ctx_put - Releases a reference to the internal eventfd context.
105 : * @ctx: [in] Pointer to eventfd context.
106 : *
107 : * The eventfd context reference must have been previously acquired either
108 : * with eventfd_ctx_fdget() or eventfd_ctx_fileget().
109 : */
110 0 : void eventfd_ctx_put(struct eventfd_ctx *ctx)
111 : {
112 0 : kref_put(&ctx->kref, eventfd_free);
113 0 : }
114 : EXPORT_SYMBOL_GPL(eventfd_ctx_put);
115 :
116 0 : static int eventfd_release(struct inode *inode, struct file *file)
117 : {
118 0 : struct eventfd_ctx *ctx = file->private_data;
119 :
120 0 : wake_up_poll(&ctx->wqh, EPOLLHUP);
121 0 : eventfd_ctx_put(ctx);
122 0 : return 0;
123 : }
124 :
125 0 : static __poll_t eventfd_poll(struct file *file, poll_table *wait)
126 : {
127 0 : struct eventfd_ctx *ctx = file->private_data;
128 0 : __poll_t events = 0;
129 : u64 count;
130 :
131 0 : poll_wait(file, &ctx->wqh, wait);
132 :
133 : /*
134 : * All writes to ctx->count occur within ctx->wqh.lock. This read
135 : * can be done outside ctx->wqh.lock because we know that poll_wait
136 : * takes that lock (through add_wait_queue) if our caller will sleep.
137 : *
138 : * The read _can_ therefore seep into add_wait_queue's critical
139 : * section, but cannot move above it! add_wait_queue's spin_lock acts
140 : * as an acquire barrier and ensures that the read be ordered properly
141 : * against the writes. The following CAN happen and is safe:
142 : *
143 : * poll write
144 : * ----------------- ------------
145 : * lock ctx->wqh.lock (in poll_wait)
146 : * count = ctx->count
147 : * __add_wait_queue
148 : * unlock ctx->wqh.lock
149 : * lock ctx->qwh.lock
150 : * ctx->count += n
151 : * if (waitqueue_active)
152 : * wake_up_locked_poll
153 : * unlock ctx->qwh.lock
154 : * eventfd_poll returns 0
155 : *
156 : * but the following, which would miss a wakeup, cannot happen:
157 : *
158 : * poll write
159 : * ----------------- ------------
160 : * count = ctx->count (INVALID!)
161 : * lock ctx->qwh.lock
162 : * ctx->count += n
163 : * **waitqueue_active is false**
164 : * **no wake_up_locked_poll!**
165 : * unlock ctx->qwh.lock
166 : * lock ctx->wqh.lock (in poll_wait)
167 : * __add_wait_queue
168 : * unlock ctx->wqh.lock
169 : * eventfd_poll returns 0
170 : */
171 0 : count = READ_ONCE(ctx->count);
172 :
173 0 : if (count > 0)
174 0 : events |= EPOLLIN;
175 0 : if (count == ULLONG_MAX)
176 0 : events |= EPOLLERR;
177 0 : if (ULLONG_MAX - 1 > count)
178 0 : events |= EPOLLOUT;
179 :
180 0 : return events;
181 : }
182 :
183 0 : void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
184 : {
185 : lockdep_assert_held(&ctx->wqh.lock);
186 :
187 0 : *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
188 0 : ctx->count -= *cnt;
189 0 : }
190 : EXPORT_SYMBOL_GPL(eventfd_ctx_do_read);
191 :
192 : /**
193 : * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue.
194 : * @ctx: [in] Pointer to eventfd context.
195 : * @wait: [in] Wait queue to be removed.
196 : * @cnt: [out] Pointer to the 64-bit counter value.
197 : *
198 : * Returns %0 if successful, or the following error codes:
199 : *
200 : * -EAGAIN : The operation would have blocked.
201 : *
202 : * This is used to atomically remove a wait queue entry from the eventfd wait
203 : * queue head, and read/reset the counter value.
204 : */
205 0 : int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
206 : __u64 *cnt)
207 : {
208 : unsigned long flags;
209 :
210 0 : spin_lock_irqsave(&ctx->wqh.lock, flags);
211 0 : eventfd_ctx_do_read(ctx, cnt);
212 0 : __remove_wait_queue(&ctx->wqh, wait);
213 0 : if (*cnt != 0 && waitqueue_active(&ctx->wqh))
214 0 : wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
215 0 : spin_unlock_irqrestore(&ctx->wqh.lock, flags);
216 :
217 0 : return *cnt != 0 ? 0 : -EAGAIN;
218 : }
219 : EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
220 :
221 0 : static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to)
222 : {
223 0 : struct file *file = iocb->ki_filp;
224 0 : struct eventfd_ctx *ctx = file->private_data;
225 0 : __u64 ucnt = 0;
226 0 : DECLARE_WAITQUEUE(wait, current);
227 :
228 0 : if (iov_iter_count(to) < sizeof(ucnt))
229 : return -EINVAL;
230 0 : spin_lock_irq(&ctx->wqh.lock);
231 0 : if (!ctx->count) {
232 0 : if ((file->f_flags & O_NONBLOCK) ||
233 0 : (iocb->ki_flags & IOCB_NOWAIT)) {
234 0 : spin_unlock_irq(&ctx->wqh.lock);
235 0 : return -EAGAIN;
236 : }
237 0 : __add_wait_queue(&ctx->wqh, &wait);
238 : for (;;) {
239 0 : set_current_state(TASK_INTERRUPTIBLE);
240 0 : if (ctx->count)
241 : break;
242 0 : if (signal_pending(current)) {
243 0 : __remove_wait_queue(&ctx->wqh, &wait);
244 0 : __set_current_state(TASK_RUNNING);
245 0 : spin_unlock_irq(&ctx->wqh.lock);
246 0 : return -ERESTARTSYS;
247 : }
248 0 : spin_unlock_irq(&ctx->wqh.lock);
249 0 : schedule();
250 0 : spin_lock_irq(&ctx->wqh.lock);
251 : }
252 0 : __remove_wait_queue(&ctx->wqh, &wait);
253 0 : __set_current_state(TASK_RUNNING);
254 : }
255 0 : eventfd_ctx_do_read(ctx, &ucnt);
256 0 : if (waitqueue_active(&ctx->wqh))
257 0 : wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
258 0 : spin_unlock_irq(&ctx->wqh.lock);
259 0 : if (unlikely(copy_to_iter(&ucnt, sizeof(ucnt), to) != sizeof(ucnt)))
260 : return -EFAULT;
261 :
262 0 : return sizeof(ucnt);
263 : }
264 :
265 0 : static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
266 : loff_t *ppos)
267 : {
268 0 : struct eventfd_ctx *ctx = file->private_data;
269 : ssize_t res;
270 : __u64 ucnt;
271 0 : DECLARE_WAITQUEUE(wait, current);
272 :
273 0 : if (count < sizeof(ucnt))
274 : return -EINVAL;
275 0 : if (copy_from_user(&ucnt, buf, sizeof(ucnt)))
276 : return -EFAULT;
277 0 : if (ucnt == ULLONG_MAX)
278 : return -EINVAL;
279 0 : spin_lock_irq(&ctx->wqh.lock);
280 0 : res = -EAGAIN;
281 0 : if (ULLONG_MAX - ctx->count > ucnt)
282 : res = sizeof(ucnt);
283 0 : else if (!(file->f_flags & O_NONBLOCK)) {
284 0 : __add_wait_queue(&ctx->wqh, &wait);
285 0 : for (res = 0;;) {
286 0 : set_current_state(TASK_INTERRUPTIBLE);
287 0 : if (ULLONG_MAX - ctx->count > ucnt) {
288 : res = sizeof(ucnt);
289 : break;
290 : }
291 0 : if (signal_pending(current)) {
292 : res = -ERESTARTSYS;
293 : break;
294 : }
295 0 : spin_unlock_irq(&ctx->wqh.lock);
296 0 : schedule();
297 0 : spin_lock_irq(&ctx->wqh.lock);
298 : }
299 0 : __remove_wait_queue(&ctx->wqh, &wait);
300 0 : __set_current_state(TASK_RUNNING);
301 : }
302 0 : if (likely(res > 0)) {
303 0 : ctx->count += ucnt;
304 0 : if (waitqueue_active(&ctx->wqh))
305 0 : wake_up_locked_poll(&ctx->wqh, EPOLLIN);
306 : }
307 0 : spin_unlock_irq(&ctx->wqh.lock);
308 :
309 0 : return res;
310 : }
311 :
#ifdef CONFIG_PROC_FS
/*
 * ->show_fdinfo() handler: prints the current counter (hexadecimal) and the
 * eventfd id into @m.
 *
 * The counter is only snapshotted under the wait queue lock; the formatting
 * is done after the lock has been dropped, so interrupts are not kept
 * disabled (and wakeups not held off) for the duration of seq_printf().
 */
static void eventfd_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct eventfd_ctx *ctx = f->private_data;
	__u64 cnt;

	spin_lock_irq(&ctx->wqh.lock);
	cnt = ctx->count;
	spin_unlock_irq(&ctx->wqh.lock);

	seq_printf(m, "eventfd-count: %16llx\n",
		   (unsigned long long)cnt);
	seq_printf(m, "eventfd-id: %d\n", ctx->id);
}
#endif
324 :
325 : static const struct file_operations eventfd_fops = {
326 : #ifdef CONFIG_PROC_FS
327 : .show_fdinfo = eventfd_show_fdinfo,
328 : #endif
329 : .release = eventfd_release,
330 : .poll = eventfd_poll,
331 : .read_iter = eventfd_read,
332 : .write = eventfd_write,
333 : .llseek = noop_llseek,
334 : };
335 :
336 : /**
337 : * eventfd_fget - Acquire a reference of an eventfd file descriptor.
338 : * @fd: [in] Eventfd file descriptor.
339 : *
340 : * Returns a pointer to the eventfd file structure in case of success, or the
341 : * following error pointer:
342 : *
343 : * -EBADF : Invalid @fd file descriptor.
344 : * -EINVAL : The @fd file descriptor is not an eventfd file.
345 : */
346 0 : struct file *eventfd_fget(int fd)
347 : {
348 : struct file *file;
349 :
350 0 : file = fget(fd);
351 0 : if (!file)
352 : return ERR_PTR(-EBADF);
353 0 : if (file->f_op != &eventfd_fops) {
354 0 : fput(file);
355 0 : return ERR_PTR(-EINVAL);
356 : }
357 :
358 : return file;
359 : }
360 : EXPORT_SYMBOL_GPL(eventfd_fget);
361 :
362 : /**
363 : * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context.
364 : * @fd: [in] Eventfd file descriptor.
365 : *
366 : * Returns a pointer to the internal eventfd context, otherwise the error
367 : * pointers returned by the following functions:
368 : *
369 : * eventfd_fget
370 : */
371 0 : struct eventfd_ctx *eventfd_ctx_fdget(int fd)
372 : {
373 : struct eventfd_ctx *ctx;
374 0 : struct fd f = fdget(fd);
375 0 : if (!f.file)
376 : return ERR_PTR(-EBADF);
377 0 : ctx = eventfd_ctx_fileget(f.file);
378 0 : fdput(f);
379 : return ctx;
380 : }
381 : EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);
382 :
383 : /**
384 : * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context.
385 : * @file: [in] Eventfd file pointer.
386 : *
387 : * Returns a pointer to the internal eventfd context, otherwise the error
388 : * pointer:
389 : *
390 : * -EINVAL : The @fd file descriptor is not an eventfd file.
391 : */
392 0 : struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
393 : {
394 : struct eventfd_ctx *ctx;
395 :
396 0 : if (file->f_op != &eventfd_fops)
397 : return ERR_PTR(-EINVAL);
398 :
399 0 : ctx = file->private_data;
400 0 : kref_get(&ctx->kref);
401 0 : return ctx;
402 : }
403 : EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);
404 :
405 0 : static int do_eventfd(unsigned int count, int flags)
406 : {
407 : struct eventfd_ctx *ctx;
408 : struct file *file;
409 : int fd;
410 :
411 : /* Check the EFD_* constants for consistency. */
412 : BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
413 : BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
414 :
415 0 : if (flags & ~EFD_FLAGS_SET)
416 : return -EINVAL;
417 :
418 0 : ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
419 0 : if (!ctx)
420 : return -ENOMEM;
421 :
422 0 : kref_init(&ctx->kref);
423 0 : init_waitqueue_head(&ctx->wqh);
424 0 : ctx->count = count;
425 0 : ctx->flags = flags;
426 0 : ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL);
427 :
428 0 : flags &= EFD_SHARED_FCNTL_FLAGS;
429 0 : flags |= O_RDWR;
430 0 : fd = get_unused_fd_flags(flags);
431 0 : if (fd < 0)
432 : goto err;
433 :
434 0 : file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, flags);
435 0 : if (IS_ERR(file)) {
436 0 : put_unused_fd(fd);
437 0 : fd = PTR_ERR(file);
438 0 : goto err;
439 : }
440 :
441 0 : file->f_mode |= FMODE_NOWAIT;
442 0 : fd_install(fd, file);
443 0 : return fd;
444 : err:
445 0 : eventfd_free_ctx(ctx);
446 0 : return fd;
447 : }
448 :
449 0 : SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
450 : {
451 0 : return do_eventfd(count, flags);
452 : }
453 :
454 0 : SYSCALL_DEFINE1(eventfd, unsigned int, count)
455 : {
456 0 : return do_eventfd(count, 0);
457 : }
458 :
|