Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-only
2 : /*
3 : * Copyright (C) 1991, 1992 Linus Torvalds
4 : * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
5 : * Copyright (C) 2016 - 2020 Christoph Hellwig
6 : */
7 :
8 : #include <linux/init.h>
9 : #include <linux/mm.h>
10 : #include <linux/slab.h>
11 : #include <linux/kmod.h>
12 : #include <linux/major.h>
13 : #include <linux/device_cgroup.h>
14 : #include <linux/blkdev.h>
15 : #include <linux/blk-integrity.h>
16 : #include <linux/backing-dev.h>
17 : #include <linux/module.h>
18 : #include <linux/blkpg.h>
19 : #include <linux/magic.h>
20 : #include <linux/buffer_head.h>
21 : #include <linux/swap.h>
22 : #include <linux/writeback.h>
23 : #include <linux/mount.h>
24 : #include <linux/pseudo_fs.h>
25 : #include <linux/uio.h>
26 : #include <linux/namei.h>
27 : #include <linux/part_stat.h>
28 : #include <linux/uaccess.h>
29 : #include "../fs/internal.h"
30 : #include "blk.h"
31 :
32 : struct bdev_inode {
33 : struct block_device bdev;
34 : struct inode vfs_inode;
35 : };
36 :
37 : static inline struct bdev_inode *BDEV_I(struct inode *inode)
38 : {
39 0 : return container_of(inode, struct bdev_inode, vfs_inode);
40 : }
41 :
42 0 : struct block_device *I_BDEV(struct inode *inode)
43 : {
44 0 : return &BDEV_I(inode)->bdev;
45 : }
46 : EXPORT_SYMBOL(I_BDEV);
47 :
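/*
 * Write the block device's backing inode out to disk. Loop while the inode
 * keeps being redirtied so a concurrent redirty is not lost, and warn
 * (ratelimited) if writeback fails.
 */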
48 0 : static void bdev_write_inode(struct block_device *bdev)
49 : {
50 0 : struct inode *inode = bdev->bd_inode;
51 : int ret;
52 :
53 0 : spin_lock(&inode->i_lock);
54 0 : while (inode->i_state & I_DIRTY) {
55 0 : spin_unlock(&inode->i_lock);
56 0 : ret = write_inode_now(inode, true);
57 0 : if (ret) {
58 : char name[BDEVNAME_SIZE];
59 0 : pr_warn_ratelimited("VFS: Dirty inode writeback failed "
60 : "for block device %s (err=%d).\n",
61 : bdevname(bdev, name), ret);
62 : }
63 0 : spin_lock(&inode->i_lock);
64 : }
65 0 : spin_unlock(&inode->i_lock);
66 0 : }
67 :
68 : /* Kill _all_ buffers and pagecache, dirty or not. */
69 0 : static void kill_bdev(struct block_device *bdev)
70 : {
71 0 : struct address_space *mapping = bdev->bd_inode->i_mapping;
72 :
73 0 : if (mapping_empty(mapping))
74 : return;
75 :
76 0 : invalidate_bh_lrus();
77 0 : truncate_inode_pages(mapping, 0);
78 : }
79 :
80 : /* Invalidate clean unused buffers and pagecache. */
81 0 : void invalidate_bdev(struct block_device *bdev)
82 : {
83 0 : struct address_space *mapping = bdev->bd_inode->i_mapping;
84 :
85 0 : if (mapping->nrpages) {
86 0 : invalidate_bh_lrus();
87 0 : lru_add_drain_all(); /* make sure all lru add caches are flushed */
88 0 : invalidate_mapping_pages(mapping, 0, -1);
89 : }
90 0 : }
91 : EXPORT_SYMBOL(invalidate_bdev);
92 :
93 : /*
94 : * Drop all buffers & page cache for given bdev range. This function bails
95 : * with error if bdev has other exclusive owner (such as filesystem).
96 : */
97 0 : int truncate_bdev_range(struct block_device *bdev, fmode_t mode,
98 : loff_t lstart, loff_t lend)
99 : {
100 : /*
101 : * If we don't hold an exclusive handle for the device, upgrade to one
102 : * while we discard the buffer cache to avoid discarding buffers
103 : * under a live filesystem.
104 : */
105 0 : if (!(mode & FMODE_EXCL)) {
106 0 : int err = bd_prepare_to_claim(bdev, truncate_bdev_range);
107 0 : if (err)
108 : goto invalidate;
109 : }
110 :
111 0 : truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend);
112 0 : if (!(mode & FMODE_EXCL))
113 : bd_abort_claiming(bdev, truncate_bdev_range);
114 : return 0;
115 :
116 : invalidate:
117 : /*
118 : * Someone else holds the device open exclusively. Try invalidating instead.
119 : * The 'end' argument is inclusive so the rounding is safe.
120 : */
121 0 : return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping,
122 0 : lstart >> PAGE_SHIFT,
123 0 : lend >> PAGE_SHIFT);
124 : }
125 :
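/*
 * Pick the initial block size for a freshly opened block device: start at
 * the logical block size and double it up to PAGE_SIZE for as long as the
 * device size stays aligned to the candidate size.
 */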
126 0 : static void set_init_blocksize(struct block_device *bdev)
127 : {
128 0 : unsigned int bsize = bdev_logical_block_size(bdev);
129 0 : loff_t size = i_size_read(bdev->bd_inode);
130 :
131 0 : while (bsize < PAGE_SIZE) {
132 0 : if (size & bsize)
133 : break;
134 0 : bsize <<= 1;
135 : }
136 0 : bdev->bd_inode->i_blkbits = blksize_bits(bsize);
137 0 : }
138 :
139 0 : int set_blocksize(struct block_device *bdev, int size)
140 : {
141 : /* Size must be a power of two, and between 512 and PAGE_SIZE */
142 0 : if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
143 : return -EINVAL;
144 :
145 : /* Size cannot be smaller than the size supported by the device */
146 0 : if (size < bdev_logical_block_size(bdev))
147 : return -EINVAL;
148 :
149 : /* Don't change the size if it is the same as the current one */
150 0 : if (bdev->bd_inode->i_blkbits != blksize_bits(size)) {
151 0 : sync_blockdev(bdev);
152 0 : bdev->bd_inode->i_blkbits = blksize_bits(size);
153 0 : kill_bdev(bdev);
154 : }
155 : return 0;
156 : }
157 :
158 : EXPORT_SYMBOL(set_blocksize);
159 :
160 0 : int sb_set_blocksize(struct super_block *sb, int size)
161 : {
162 0 : if (set_blocksize(sb->s_bdev, size))
163 : return 0;
164 : /* If we get here, we know size is a power of two
165 : * and its value is between 512 and PAGE_SIZE */
166 0 : sb->s_blocksize = size;
167 0 : sb->s_blocksize_bits = blksize_bits(size);
168 0 : return sb->s_blocksize;
169 : }
170 :
171 : EXPORT_SYMBOL(sb_set_blocksize);
172 :
173 0 : int sb_min_blocksize(struct super_block *sb, int size)
174 : {
175 0 : int minsize = bdev_logical_block_size(sb->s_bdev);
176 0 : if (size < minsize)
177 0 : size = minsize;
178 0 : return sb_set_blocksize(sb, size);
179 : }
180 :
181 : EXPORT_SYMBOL(sb_min_blocksize);
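/*
 * Usage sketch (illustrative only, not part of this file): a filesystem's
 * fill_super() typically starts from the helpers above - pick a minimum
 * block size before reading the on-disk superblock, then switch to the
 * block size recorded there. MYFS_MIN_BLOCK_SIZE and the dsb field are
 * hypothetical.
 *
 *	if (!sb_min_blocksize(sb, MYFS_MIN_BLOCK_SIZE))
 *		return -EINVAL;
 *	...
 *	if (!sb_set_blocksize(sb, le32_to_cpu(dsb->s_block_size)))
 *		return -EINVAL;
 */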
182 :
183 0 : int sync_blockdev_nowait(struct block_device *bdev)
184 : {
185 0 : if (!bdev)
186 : return 0;
187 0 : return filemap_flush(bdev->bd_inode->i_mapping);
188 : }
189 : EXPORT_SYMBOL_GPL(sync_blockdev_nowait);
190 :
191 : /*
192 : * Write out and wait upon all the dirty data associated with a block
193 : * device via its mapping. Does not take the superblock lock.
194 : */
195 0 : int sync_blockdev(struct block_device *bdev)
196 : {
197 0 : if (!bdev)
198 : return 0;
199 0 : return filemap_write_and_wait(bdev->bd_inode->i_mapping);
200 : }
201 : EXPORT_SYMBOL(sync_blockdev);
202 :
203 : /*
204 : * Write out and wait upon all dirty data associated with this
205 : * device. Filesystem data as well as the underlying block
206 : * device. Takes the superblock lock.
207 : */
208 0 : int fsync_bdev(struct block_device *bdev)
209 : {
210 0 : struct super_block *sb = get_super(bdev);
211 0 : if (sb) {
212 0 : int res = sync_filesystem(sb);
213 0 : drop_super(sb);
214 0 : return res;
215 : }
216 : return sync_blockdev(bdev);
217 : }
218 : EXPORT_SYMBOL(fsync_bdev);
219 :
220 : /**
221 : * freeze_bdev -- lock a filesystem and force it into a consistent state
222 : * @bdev: blockdevice to lock
223 : *
224 : * If a superblock is found on this device, we take the s_umount semaphore
225 : * on it to make sure nobody unmounts until the snapshot creation is done.
226 : * The reference counter (bd_fsfreeze_count) guarantees that only the last
227 : * unfreeze process can unfreeze the frozen filesystem actually when multiple
228 : * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
229 : * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
230 : * actually.
231 : */
232 0 : int freeze_bdev(struct block_device *bdev)
233 : {
234 : struct super_block *sb;
235 0 : int error = 0;
236 :
237 0 : mutex_lock(&bdev->bd_fsfreeze_mutex);
238 0 : if (++bdev->bd_fsfreeze_count > 1)
239 : goto done;
240 :
241 0 : sb = get_active_super(bdev);
242 0 : if (!sb)
243 : goto sync;
244 0 : if (sb->s_op->freeze_super)
245 0 : error = sb->s_op->freeze_super(sb);
246 : else
247 0 : error = freeze_super(sb);
248 0 : deactivate_super(sb);
249 :
250 0 : if (error) {
251 0 : bdev->bd_fsfreeze_count--;
252 0 : goto done;
253 : }
254 0 : bdev->bd_fsfreeze_sb = sb;
255 :
256 : sync:
257 : sync_blockdev(bdev);
258 : done:
259 0 : mutex_unlock(&bdev->bd_fsfreeze_mutex);
260 0 : return error;
261 : }
262 : EXPORT_SYMBOL(freeze_bdev);
263 :
264 : /**
265 : * thaw_bdev -- unlock filesystem
266 : * @bdev: blockdevice to unlock
267 : *
268 : * Unlocks the filesystem and marks it writeable again after freeze_bdev().
269 : */
270 0 : int thaw_bdev(struct block_device *bdev)
271 : {
272 : struct super_block *sb;
273 0 : int error = -EINVAL;
274 :
275 0 : mutex_lock(&bdev->bd_fsfreeze_mutex);
276 0 : if (!bdev->bd_fsfreeze_count)
277 : goto out;
278 :
279 0 : error = 0;
280 0 : if (--bdev->bd_fsfreeze_count > 0)
281 : goto out;
282 :
283 0 : sb = bdev->bd_fsfreeze_sb;
284 0 : if (!sb)
285 : goto out;
286 :
287 0 : if (sb->s_op->thaw_super)
288 0 : error = sb->s_op->thaw_super(sb);
289 : else
290 0 : error = thaw_super(sb);
291 0 : if (error)
292 0 : bdev->bd_fsfreeze_count++;
293 : else
294 0 : bdev->bd_fsfreeze_sb = NULL;
295 : out:
296 0 : mutex_unlock(&bdev->bd_fsfreeze_mutex);
297 0 : return error;
298 : }
299 : EXPORT_SYMBOL(thaw_bdev);
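/*
 * Usage sketch (illustrative only, not part of this file): code that takes
 * a block-level snapshot would typically bracket it with a freeze/thaw pair
 * so the filesystem on @bdev is consistent on disk while the snapshot is
 * taken; take_snapshot() is a hypothetical helper.
 *
 *	error = freeze_bdev(bdev);
 *	if (error)
 *		return error;
 *	error = take_snapshot(bdev);
 *	thaw_bdev(bdev);
 *	return error;
 */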
300 :
301 : /**
302 : * bdev_read_page() - Start reading a page from a block device
303 : * @bdev: The device to read the page from
304 : * @sector: The offset on the device to read the page to (need not be aligned)
305 : * @page: The page to read
306 : *
307 : * On entry, the page should be locked. It will be unlocked when the page
308 : * has been read. If the block driver implements rw_page synchronously,
309 : * that will be true on exit from this function, but it need not be.
310 : *
311 : * Errors returned by this function are usually "soft", eg out of memory, or
312 : * queue full; callers should try a different route to read this page rather
313 : * than propagate an error back up the stack.
314 : *
315 : * Return: negative errno if an error occurs, 0 if submission was successful.
316 : */
317 0 : int bdev_read_page(struct block_device *bdev, sector_t sector,
318 : struct page *page)
319 : {
320 0 : const struct block_device_operations *ops = bdev->bd_disk->fops;
321 0 : int result = -EOPNOTSUPP;
322 :
323 0 : if (!ops->rw_page || bdev_get_integrity(bdev))
324 : return result;
325 :
326 0 : result = blk_queue_enter(bdev_get_queue(bdev), 0);
327 0 : if (result)
328 : return result;
329 0 : result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
330 : REQ_OP_READ);
331 0 : blk_queue_exit(bdev_get_queue(bdev));
332 0 : return result;
333 : }
334 :
335 : /**
336 : * bdev_write_page() - Start writing a page to a block device
337 : * @bdev: The device to write the page to
338 : * @sector: The offset on the device to write the page to (need not be aligned)
339 : * @page: The page to write
340 : * @wbc: The writeback_control for the write
341 : *
342 : * On entry, the page should be locked and not currently under writeback.
343 : * On exit, if the write started successfully, the page will be unlocked and
344 : * under writeback. If the write failed already (eg the driver failed to
345 : * queue the page to the device), the page will still be locked. If the
346 : * caller is a ->writepage implementation, it will need to unlock the page.
347 : *
348 : * Errors returned by this function are usually "soft", eg out of memory, or
349 : * queue full; callers should try a different route to write this page rather
350 : * than propagate an error back up the stack.
351 : *
352 : * Return: negative errno if an error occurs, 0 if submission was successful.
353 : */
354 0 : int bdev_write_page(struct block_device *bdev, sector_t sector,
355 : struct page *page, struct writeback_control *wbc)
356 : {
357 : int result;
358 0 : const struct block_device_operations *ops = bdev->bd_disk->fops;
359 :
360 0 : if (!ops->rw_page || bdev_get_integrity(bdev))
361 : return -EOPNOTSUPP;
362 0 : result = blk_queue_enter(bdev_get_queue(bdev), 0);
363 0 : if (result)
364 : return result;
365 :
366 0 : set_page_writeback(page);
367 0 : result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
368 : REQ_OP_WRITE);
369 0 : if (result) {
370 0 : end_page_writeback(page);
371 : } else {
372 0 : clean_page_buffers(page);
373 0 : unlock_page(page);
374 : }
375 0 : blk_queue_exit(bdev_get_queue(bdev));
376 0 : return result;
377 : }
378 :
379 : /*
380 : * pseudo-fs
381 : */
382 :
383 : static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock);
384 : static struct kmem_cache *bdev_cachep __read_mostly;
385 :
386 1 : static struct inode *bdev_alloc_inode(struct super_block *sb)
387 : {
388 2 : struct bdev_inode *ei = alloc_inode_sb(sb, bdev_cachep, GFP_KERNEL);
389 :
390 1 : if (!ei)
391 : return NULL;
392 1 : memset(&ei->bdev, 0, sizeof(ei->bdev));
393 1 : return &ei->vfs_inode;
394 : }
395 :
396 0 : static void bdev_free_inode(struct inode *inode)
397 : {
398 0 : struct block_device *bdev = I_BDEV(inode);
399 :
400 0 : free_percpu(bdev->bd_stats);
401 0 : kfree(bdev->bd_meta_info);
402 :
403 0 : if (!bdev_is_partition(bdev)) {
404 0 : if (bdev->bd_disk && bdev->bd_disk->bdi)
405 0 : bdi_put(bdev->bd_disk->bdi);
406 0 : kfree(bdev->bd_disk);
407 : }
408 :
409 0 : if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR)
410 0 : blk_free_ext_minor(MINOR(bdev->bd_dev));
411 :
412 0 : kmem_cache_free(bdev_cachep, BDEV_I(inode));
413 0 : }
414 :
415 11 : static void init_once(void *data)
416 : {
417 11 : struct bdev_inode *ei = data;
418 :
419 11 : inode_init_once(&ei->vfs_inode);
420 11 : }
421 :
422 0 : static void bdev_evict_inode(struct inode *inode)
423 : {
424 0 : truncate_inode_pages_final(&inode->i_data);
425 0 : invalidate_inode_buffers(inode); /* is it needed here? */
426 0 : clear_inode(inode);
427 0 : }
428 :
429 : static const struct super_operations bdev_sops = {
430 : .statfs = simple_statfs,
431 : .alloc_inode = bdev_alloc_inode,
432 : .free_inode = bdev_free_inode,
433 : .drop_inode = generic_delete_inode,
434 : .evict_inode = bdev_evict_inode,
435 : };
436 :
437 1 : static int bd_init_fs_context(struct fs_context *fc)
438 : {
439 1 : struct pseudo_fs_context *ctx = init_pseudo(fc, BDEVFS_MAGIC);
440 1 : if (!ctx)
441 : return -ENOMEM;
442 1 : fc->s_iflags |= SB_I_CGROUPWB;
443 1 : ctx->ops = &bdev_sops;
444 1 : return 0;
445 : }
446 :
447 : static struct file_system_type bd_type = {
448 : .name = "bdev",
449 : .init_fs_context = bd_init_fs_context,
450 : .kill_sb = kill_anon_super,
451 : };
452 :
453 : struct super_block *blockdev_superblock __read_mostly;
454 : EXPORT_SYMBOL_GPL(blockdev_superblock);
455 :
456 1 : void __init bdev_cache_init(void)
457 : {
458 : int err;
459 : static struct vfsmount *bd_mnt;
460 :
461 1 : bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
462 : 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
463 : SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC),
464 : init_once);
465 1 : err = register_filesystem(&bd_type);
466 1 : if (err)
467 0 : panic("Cannot register bdev pseudo-fs");
468 1 : bd_mnt = kern_mount(&bd_type);
469 2 : if (IS_ERR(bd_mnt))
470 0 : panic("Cannot create bdev pseudo-fs");
471 1 : blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */
472 1 : }
473 :
474 0 : struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
475 : {
476 : struct block_device *bdev;
477 : struct inode *inode;
478 :
479 0 : inode = new_inode(blockdev_superblock);
480 0 : if (!inode)
481 : return NULL;
482 0 : inode->i_mode = S_IFBLK;
483 0 : inode->i_rdev = 0;
484 0 : inode->i_data.a_ops = &def_blk_aops;
485 0 : mapping_set_gfp_mask(&inode->i_data, GFP_USER);
486 :
487 0 : bdev = I_BDEV(inode);
488 0 : mutex_init(&bdev->bd_fsfreeze_mutex);
489 0 : spin_lock_init(&bdev->bd_size_lock);
490 0 : bdev->bd_partno = partno;
491 0 : bdev->bd_inode = inode;
492 0 : bdev->bd_queue = disk->queue;
493 0 : bdev->bd_stats = alloc_percpu(struct disk_stats);
494 0 : if (!bdev->bd_stats) {
495 0 : iput(inode);
496 0 : return NULL;
497 : }
498 0 : bdev->bd_disk = disk;
499 0 : return bdev;
500 : }
501 :
502 0 : void bdev_add(struct block_device *bdev, dev_t dev)
503 : {
504 0 : bdev->bd_dev = dev;
505 0 : bdev->bd_inode->i_rdev = dev;
506 0 : bdev->bd_inode->i_ino = dev;
507 0 : insert_inode_hash(bdev->bd_inode);
508 0 : }
509 :
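/* Total number of page cache pages attached to block device inodes. */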
510 3 : long nr_blockdev_pages(void)
511 : {
512 : struct inode *inode;
513 3 : long ret = 0;
514 :
515 6 : spin_lock(&blockdev_superblock->s_inode_list_lock);
516 6 : list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list)
517 3 : ret += inode->i_mapping->nrpages;
518 6 : spin_unlock(&blockdev_superblock->s_inode_list_lock);
519 :
520 3 : return ret;
521 : }
522 :
523 : /**
524 : * bd_may_claim - test whether a block device can be claimed
525 : * @bdev: block device of interest
526 : * @whole: whole block device containing @bdev, may equal @bdev
527 : * @holder: holder trying to claim @bdev
528 : *
529 : * Test whether @bdev can be claimed by @holder.
530 : *
531 : * CONTEXT:
532 : * spin_lock(&bdev_lock).
533 : *
534 : * RETURNS:
535 : * %true if @bdev can be claimed, %false otherwise.
536 : */
537 0 : static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
538 : void *holder)
539 : {
540 0 : if (bdev->bd_holder == holder)
541 : return true; /* already a holder */
542 0 : else if (bdev->bd_holder != NULL)
543 : return false; /* held by someone else */
544 0 : else if (whole == bdev)
545 : return true; /* is a whole device which isn't held */
546 :
547 0 : else if (whole->bd_holder == bd_may_claim)
548 : return true; /* is a partition of a device that is being partitioned */
549 0 : else if (whole->bd_holder != NULL)
550 : return false; /* is a partition of a held device */
551 : else
552 0 : return true; /* is a partition of an un-held device */
553 : }
554 :
555 : /**
556 : * bd_prepare_to_claim - claim a block device
557 : * @bdev: block device of interest
558 : * @holder: holder trying to claim @bdev
559 : *
560 : * Claim @bdev. This function fails if @bdev is already claimed by another
561 : * holder and waits if another claiming is in progress. return, the caller
562 : * has ownership of bd_claiming and bd_holder[s].
563 : *
564 : * RETURNS:
565 : * 0 if @bdev can be claimed, -EBUSY otherwise.
566 : */
567 0 : int bd_prepare_to_claim(struct block_device *bdev, void *holder)
568 : {
569 0 : struct block_device *whole = bdev_whole(bdev);
570 :
571 0 : if (WARN_ON_ONCE(!holder))
572 : return -EINVAL;
573 : retry:
574 0 : spin_lock(&bdev_lock);
575 : /* if someone else claimed, fail */
576 0 : if (!bd_may_claim(bdev, whole, holder)) {
577 0 : spin_unlock(&bdev_lock);
578 0 : return -EBUSY;
579 : }
580 :
581 : /* if claiming is already in progress, wait for it to finish */
582 0 : if (whole->bd_claiming) {
583 0 : wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
584 0 : DEFINE_WAIT(wait);
585 :
586 0 : prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
587 0 : spin_unlock(&bdev_lock);
588 0 : schedule();
589 0 : finish_wait(wq, &wait);
590 : goto retry;
591 : }
592 :
593 : /* yay, all mine */
594 0 : whole->bd_claiming = holder;
595 0 : spin_unlock(&bdev_lock);
596 0 : return 0;
597 : }
598 : EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */
599 :
600 0 : static void bd_clear_claiming(struct block_device *whole, void *holder)
601 : {
602 : lockdep_assert_held(&bdev_lock);
603 : /* tell others that we're done */
604 0 : BUG_ON(whole->bd_claiming != holder);
605 0 : whole->bd_claiming = NULL;
606 0 : wake_up_bit(&whole->bd_claiming, 0);
607 0 : }
608 :
609 : /**
610 : * bd_finish_claiming - finish claiming of a block device
611 : * @bdev: block device of interest
612 : * @holder: holder that has claimed @bdev
613 : *
614 : * Finish exclusive open of a block device. Mark the device as exclusively
615 : * open by the holder and wake up all waiters for exclusive open to finish.
616 : */
617 0 : static void bd_finish_claiming(struct block_device *bdev, void *holder)
618 : {
619 0 : struct block_device *whole = bdev_whole(bdev);
620 :
621 0 : spin_lock(&bdev_lock);
622 0 : BUG_ON(!bd_may_claim(bdev, whole, holder));
623 : /*
624 : * Note that for a whole device bd_holders will be incremented twice,
625 : * and bd_holder will be set to bd_may_claim before being set to holder
626 : */
627 0 : whole->bd_holders++;
628 0 : whole->bd_holder = bd_may_claim;
629 0 : bdev->bd_holders++;
630 0 : bdev->bd_holder = holder;
631 0 : bd_clear_claiming(whole, holder);
632 0 : spin_unlock(&bdev_lock);
633 0 : }
634 :
635 : /**
636 : * bd_abort_claiming - abort claiming of a block device
637 : * @bdev: block device of interest
638 : * @holder: holder that has claimed @bdev
639 : *
640 : * Abort claiming of a block device when the exclusive open failed. This can
641 : * also be used when exclusive open is not actually desired and we just needed
642 : * to block other exclusive openers for a while.
643 : */
644 0 : void bd_abort_claiming(struct block_device *bdev, void *holder)
645 : {
646 0 : spin_lock(&bdev_lock);
647 0 : bd_clear_claiming(bdev_whole(bdev), holder);
648 0 : spin_unlock(&bdev_lock);
649 0 : }
650 : EXPORT_SYMBOL(bd_abort_claiming);
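/*
 * Usage sketch (illustrative only, not part of this file):
 * truncate_bdev_range() above shows the intended pattern - temporarily claim
 * the device so nobody else can open it exclusively, do the work, then drop
 * the claim without ever finishing an exclusive open. my_claim_token is a
 * hypothetical holder cookie.
 *
 *	error = bd_prepare_to_claim(bdev, my_claim_token);
 *	if (error)
 *		return error;
 *	... work that must not race with exclusive openers ...
 *	bd_abort_claiming(bdev, my_claim_token);
 */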
651 :
652 0 : static void blkdev_flush_mapping(struct block_device *bdev)
653 : {
654 0 : WARN_ON_ONCE(bdev->bd_holders);
655 0 : sync_blockdev(bdev);
656 0 : kill_bdev(bdev);
657 0 : bdev_write_inode(bdev);
658 0 : }
659 :
660 0 : static int blkdev_get_whole(struct block_device *bdev, fmode_t mode)
661 : {
662 0 : struct gendisk *disk = bdev->bd_disk;
663 : int ret;
664 :
665 0 : if (disk->fops->open) {
666 0 : ret = disk->fops->open(bdev, mode);
667 0 : if (ret) {
668 : /* avoid ghost partitions on a removed medium */
669 0 : if (ret == -ENOMEDIUM &&
670 0 : test_bit(GD_NEED_PART_SCAN, &disk->state))
671 0 : bdev_disk_changed(disk, true);
672 : return ret;
673 : }
674 : }
675 :
676 0 : if (!bdev->bd_openers)
677 0 : set_init_blocksize(bdev);
678 0 : if (test_bit(GD_NEED_PART_SCAN, &disk->state))
679 0 : bdev_disk_changed(disk, false);
680 0 : bdev->bd_openers++;
681 0 : return 0;
682 : }
683 :
684 0 : static void blkdev_put_whole(struct block_device *bdev, fmode_t mode)
685 : {
686 0 : if (!--bdev->bd_openers)
687 0 : blkdev_flush_mapping(bdev);
688 0 : if (bdev->bd_disk->fops->release)
689 0 : bdev->bd_disk->fops->release(bdev->bd_disk, mode);
690 0 : }
691 :
692 0 : static int blkdev_get_part(struct block_device *part, fmode_t mode)
693 : {
694 0 : struct gendisk *disk = part->bd_disk;
695 : int ret;
696 :
697 0 : if (part->bd_openers)
698 : goto done;
699 :
700 0 : ret = blkdev_get_whole(bdev_whole(part), mode);
701 0 : if (ret)
702 : return ret;
703 :
704 0 : ret = -ENXIO;
705 0 : if (!bdev_nr_sectors(part))
706 : goto out_blkdev_put;
707 :
708 0 : disk->open_partitions++;
709 0 : set_init_blocksize(part);
710 : done:
711 0 : part->bd_openers++;
712 0 : return 0;
713 :
714 : out_blkdev_put:
715 0 : blkdev_put_whole(bdev_whole(part), mode);
716 0 : return ret;
717 : }
718 :
719 0 : static void blkdev_put_part(struct block_device *part, fmode_t mode)
720 : {
721 0 : struct block_device *whole = bdev_whole(part);
722 :
723 0 : if (--part->bd_openers)
724 : return;
725 0 : blkdev_flush_mapping(part);
726 0 : whole->bd_disk->open_partitions--;
727 0 : blkdev_put_whole(whole, mode);
728 : }
729 :
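/*
 * Look up the block device for @dev and return it with a reference on its
 * embedded struct device taken, but without opening it. Returns NULL if no
 * such device exists (optionally after trying the deprecated legacy module
 * autoload).
 */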
730 0 : struct block_device *blkdev_get_no_open(dev_t dev)
731 : {
732 : struct block_device *bdev;
733 : struct inode *inode;
734 :
735 0 : inode = ilookup(blockdev_superblock, dev);
736 0 : if (!inode && IS_ENABLED(CONFIG_BLOCK_LEGACY_AUTOLOAD)) {
737 0 : blk_request_module(dev);
738 0 : inode = ilookup(blockdev_superblock, dev);
739 0 : if (inode)
740 0 : pr_warn_ratelimited(
741 : "block device autoloading is deprecated and will be removed.\n");
742 : }
743 0 : if (!inode)
744 : return NULL;
745 :
746 : /* switch from the inode reference to a device model one: */
747 0 : bdev = &BDEV_I(inode)->bdev;
748 0 : if (!kobject_get_unless_zero(&bdev->bd_device.kobj))
749 0 : bdev = NULL;
750 0 : iput(inode);
751 0 : return bdev;
752 : }
753 :
754 0 : void blkdev_put_no_open(struct block_device *bdev)
755 : {
756 0 : put_device(&bdev->bd_device);
757 0 : }
758 :
759 : /**
760 : * blkdev_get_by_dev - open a block device by device number
761 : * @dev: device number of block device to open
762 : * @mode: FMODE_* mask
763 : * @holder: exclusive holder identifier
764 : *
765 : * Open the block device described by device number @dev. If @mode includes
766 : * %FMODE_EXCL, the block device is opened with exclusive access. Specifying
767 : * %FMODE_EXCL with a %NULL @holder is invalid. Exclusive opens may nest for
768 : * the same @holder.
769 : *
770 : * Use this interface ONLY if you really do not have anything better - i.e. when
771 : * you are behind a truly sucky interface and all you are given is a device
772 : * number. Everything else should use blkdev_get_by_path().
773 : *
774 : * CONTEXT:
775 : * Might sleep.
776 : *
777 : * RETURNS:
778 : * Reference to the block_device on success, ERR_PTR(-errno) on failure.
779 : */
780 0 : struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
781 : {
782 0 : bool unblock_events = true;
783 : struct block_device *bdev;
784 : struct gendisk *disk;
785 : int ret;
786 :
787 0 : ret = devcgroup_check_permission(DEVCG_DEV_BLOCK,
788 0 : MAJOR(dev), MINOR(dev),
789 0 : ((mode & FMODE_READ) ? DEVCG_ACC_READ : 0) |
790 0 : ((mode & FMODE_WRITE) ? DEVCG_ACC_WRITE : 0));
791 : if (ret)
792 : return ERR_PTR(ret);
793 :
794 0 : bdev = blkdev_get_no_open(dev);
795 0 : if (!bdev)
796 : return ERR_PTR(-ENXIO);
797 0 : disk = bdev->bd_disk;
798 :
799 0 : if (mode & FMODE_EXCL) {
800 0 : ret = bd_prepare_to_claim(bdev, holder);
801 0 : if (ret)
802 : goto put_blkdev;
803 : }
804 :
805 0 : disk_block_events(disk);
806 :
807 0 : mutex_lock(&disk->open_mutex);
808 0 : ret = -ENXIO;
809 0 : if (!disk_live(disk))
810 : goto abort_claiming;
811 0 : if (!try_module_get(disk->fops->owner))
812 : goto abort_claiming;
813 0 : if (bdev_is_partition(bdev))
814 0 : ret = blkdev_get_part(bdev, mode);
815 : else
816 0 : ret = blkdev_get_whole(bdev, mode);
817 0 : if (ret)
818 : goto put_module;
819 0 : if (mode & FMODE_EXCL) {
820 0 : bd_finish_claiming(bdev, holder);
821 :
822 : /*
823 : * Block event polling for write claims if requested. Any write
824 : * holder makes the write_holder state stick until all are
825 : * released. This is good enough and tracking individual
826 : * writeable references is too fragile given the way @mode is
827 : * used in blkdev_get/put().
828 : */
829 0 : if ((mode & FMODE_WRITE) && !bdev->bd_write_holder &&
830 0 : (disk->event_flags & DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE)) {
831 0 : bdev->bd_write_holder = true;
832 0 : unblock_events = false;
833 : }
834 : }
835 0 : mutex_unlock(&disk->open_mutex);
836 :
837 0 : if (unblock_events)
838 0 : disk_unblock_events(disk);
839 : return bdev;
840 : put_module:
841 : module_put(disk->fops->owner);
842 : abort_claiming:
843 0 : if (mode & FMODE_EXCL)
844 : bd_abort_claiming(bdev, holder);
845 0 : mutex_unlock(&disk->open_mutex);
846 0 : disk_unblock_events(disk);
847 : put_blkdev:
848 0 : blkdev_put_no_open(bdev);
849 0 : return ERR_PTR(ret);
850 : }
851 : EXPORT_SYMBOL(blkdev_get_by_dev);
852 :
853 : /**
854 : * blkdev_get_by_path - open a block device by name
855 : * @path: path to the block device to open
856 : * @mode: FMODE_* mask
857 : * @holder: exclusive holder identifier
858 : *
859 : * Open the block device described by the device file at @path. If @mode
860 : * includes %FMODE_EXCL, the block device is opened with exclusive access.
861 : * Specifying %FMODE_EXCL with a %NULL @holder is invalid. Exclusive opens may
862 : * nest for the same @holder.
863 : *
864 : * CONTEXT:
865 : * Might sleep.
866 : *
867 : * RETURNS:
868 : * Reference to the block_device on success, ERR_PTR(-errno) on failure.
869 : */
870 0 : struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
871 : void *holder)
872 : {
873 : struct block_device *bdev;
874 : dev_t dev;
875 : int error;
876 :
877 0 : error = lookup_bdev(path, &dev);
878 0 : if (error)
879 0 : return ERR_PTR(error);
880 :
881 0 : bdev = blkdev_get_by_dev(dev, mode, holder);
882 0 : if (!IS_ERR(bdev) && (mode & FMODE_WRITE) && bdev_read_only(bdev)) {
883 0 : blkdev_put(bdev, mode);
884 0 : return ERR_PTR(-EACCES);
885 : }
886 :
887 : return bdev;
888 : }
889 : EXPORT_SYMBOL(blkdev_get_by_path);
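/*
 * Usage sketch (illustrative only, not part of this file): a typical
 * exclusive open by a filesystem or driver, paired with a blkdev_put()
 * using the same mode; the device path and the holder pointer are
 * hypothetical.
 *
 *	struct block_device *bdev;
 *
 *	bdev = blkdev_get_by_path("/dev/vdb", FMODE_READ | FMODE_WRITE |
 *				  FMODE_EXCL, my_holder);
 *	if (IS_ERR(bdev))
 *		return PTR_ERR(bdev);
 *	...
 *	blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
 */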
890 :
891 0 : void blkdev_put(struct block_device *bdev, fmode_t mode)
892 : {
893 0 : struct gendisk *disk = bdev->bd_disk;
894 :
895 : /*
896 : * Sync early if it looks like we're the last one. If someone else
897 : * opens the block device between now and the decrement of bd_openers
898 : * then we did a sync that we didn't need to, but that's not the end
899 : * of the world and we want to avoid long (could be several minute)
900 : * syncs while holding the mutex.
901 : */
902 0 : if (bdev->bd_openers == 1)
903 : sync_blockdev(bdev);
904 :
905 0 : mutex_lock(&disk->open_mutex);
906 0 : if (mode & FMODE_EXCL) {
907 0 : struct block_device *whole = bdev_whole(bdev);
908 : bool bdev_free;
909 :
910 : /*
911 : * Release a claim on the device. The holder fields
912 : * are protected with bdev_lock. open_mutex is to
913 : * synchronize disk_holder unlinking.
914 : */
915 0 : spin_lock(&bdev_lock);
916 :
917 0 : WARN_ON_ONCE(--bdev->bd_holders < 0);
918 0 : WARN_ON_ONCE(--whole->bd_holders < 0);
919 :
920 0 : if ((bdev_free = !bdev->bd_holders))
921 0 : bdev->bd_holder = NULL;
922 0 : if (!whole->bd_holders)
923 0 : whole->bd_holder = NULL;
924 :
925 0 : spin_unlock(&bdev_lock);
926 :
927 : /*
928 : * If this was the last claim, remove holder link and
929 : * unblock event polling if it was a write holder.
930 : */
931 0 : if (bdev_free && bdev->bd_write_holder) {
932 0 : disk_unblock_events(disk);
933 0 : bdev->bd_write_holder = false;
934 : }
935 : }
936 :
937 : /*
938 : * Trigger event checking and tell drivers to flush MEDIA_CHANGE
939 : * event. This is to ensure detection of media removal commanded
940 : * from userland - e.g. eject(1).
941 : */
942 0 : disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE);
943 :
944 0 : if (bdev_is_partition(bdev))
945 0 : blkdev_put_part(bdev, mode);
946 : else
947 0 : blkdev_put_whole(bdev, mode);
948 0 : mutex_unlock(&disk->open_mutex);
949 :
950 0 : module_put(disk->fops->owner);
951 0 : blkdev_put_no_open(bdev);
952 0 : }
953 : EXPORT_SYMBOL(blkdev_put);
954 :
955 : /**
956 : * lookup_bdev() - Look up a struct block_device by name.
957 : * @pathname: Name of the block device in the filesystem.
958 : * @dev: Pointer to the block device's dev_t, if found.
959 : *
960 : * Lookup the block device's dev_t at @pathname in the current
961 : * namespace if possible and return it in @dev.
962 : *
963 : * Context: May sleep.
964 : * Return: 0 if succeeded, negative errno otherwise.
965 : */
966 0 : int lookup_bdev(const char *pathname, dev_t *dev)
967 : {
968 : struct inode *inode;
969 : struct path path;
970 : int error;
971 :
972 0 : if (!pathname || !*pathname)
973 : return -EINVAL;
974 :
975 0 : error = kern_path(pathname, LOOKUP_FOLLOW, &path);
976 0 : if (error)
977 : return error;
978 :
979 0 : inode = d_backing_inode(path.dentry);
980 0 : error = -ENOTBLK;
981 0 : if (!S_ISBLK(inode->i_mode))
982 : goto out_path_put;
983 0 : error = -EACCES;
984 0 : if (!may_open_dev(&path))
985 : goto out_path_put;
986 :
987 0 : *dev = inode->i_rdev;
988 0 : error = 0;
989 : out_path_put:
990 0 : path_put(&path);
991 0 : return error;
992 : }
993 : EXPORT_SYMBOL(lookup_bdev);
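/*
 * Usage sketch (illustrative only, not part of this file): resolve a
 * user-supplied path to a device number without opening the device; the
 * path shown is hypothetical.
 *
 *	dev_t dev;
 *	int error;
 *
 *	error = lookup_bdev("/dev/vdb", &dev);
 *	if (error)
 *		return error;
 *	... use MAJOR(dev) / MINOR(dev) ...
 */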
994 :
995 0 : int __invalidate_device(struct block_device *bdev, bool kill_dirty)
996 : {
997 0 : struct super_block *sb = get_super(bdev);
998 0 : int res = 0;
999 :
1000 0 : if (sb) {
1001 : /*
1002 : * No need to lock the super; get_super() holds the
1003 : * read mutex so the filesystem cannot go away
1004 : * under us (->put_super runs with the write lock
1005 : * held).
1006 : */
1007 0 : shrink_dcache_sb(sb);
1008 0 : res = invalidate_inodes(sb, kill_dirty);
1009 0 : drop_super(sb);
1010 : }
1011 0 : invalidate_bdev(bdev);
1012 0 : return res;
1013 : }
1014 : EXPORT_SYMBOL(__invalidate_device);
1015 :
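/*
 * Walk all block device inodes and either start writeback of their page
 * cache (@wait == false) or wait for previously started writeback to finish
 * (@wait == true), preserving per-mapping error state so fsync(2) can still
 * report it. Devices that are not open are skipped.
 */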
1016 0 : void sync_bdevs(bool wait)
1017 : {
1018 0 : struct inode *inode, *old_inode = NULL;
1019 :
1020 0 : spin_lock(&blockdev_superblock->s_inode_list_lock);
1021 0 : list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
1022 0 : struct address_space *mapping = inode->i_mapping;
1023 : struct block_device *bdev;
1024 :
1025 0 : spin_lock(&inode->i_lock);
1026 0 : if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
1027 0 : mapping->nrpages == 0) {
1028 0 : spin_unlock(&inode->i_lock);
1029 0 : continue;
1030 : }
1031 0 : __iget(inode);
1032 0 : spin_unlock(&inode->i_lock);
1033 0 : spin_unlock(&blockdev_superblock->s_inode_list_lock);
1034 : /*
1035 : * We hold a reference to 'inode' so it couldn't have been
1036 : * removed from the s_inodes list while we dropped the
1037 : * s_inode_list_lock. We cannot iput the inode now as we can
1038 : * be holding the last reference and we cannot iput it under
1039 : * s_inode_list_lock. So we keep the reference and iput it
1040 : * later.
1041 : */
1042 0 : iput(old_inode);
1043 0 : old_inode = inode;
1044 0 : bdev = I_BDEV(inode);
1045 :
1046 0 : mutex_lock(&bdev->bd_disk->open_mutex);
1047 0 : if (!bdev->bd_openers) {
1048 : ; /* skip */
1049 0 : } else if (wait) {
1050 : /*
1051 : * We keep the error status of individual mapping so
1052 : * that applications can catch the writeback error using
1053 : * fsync(2). See filemap_fdatawait_keep_errors() for
1054 : * details.
1055 : */
1056 0 : filemap_fdatawait_keep_errors(inode->i_mapping);
1057 : } else {
1058 0 : filemap_fdatawrite(inode->i_mapping);
1059 : }
1060 0 : mutex_unlock(&bdev->bd_disk->open_mutex);
1061 :
1062 0 : spin_lock(&blockdev_superblock->s_inode_list_lock);
1063 : }
1064 0 : spin_unlock(&blockdev_superblock->s_inode_list_lock);
1065 0 : iput(old_inode);
1066 0 : }
|