Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-only
2 : /*
3 : * linux/fs/buffer.c
4 : *
5 : * Copyright (C) 1991, 1992, 2002 Linus Torvalds
6 : */
7 :
8 : /*
9 : * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
10 : *
11 : * Removed a lot of unnecessary code and simplified things now that
12 : * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
13 : *
14 : * Speed up hash, lru, and free list operations. Use gfp() for allocating
15 : * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
16 : *
17 : * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
18 : *
19 : * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
20 : */
21 :
22 : #include <linux/kernel.h>
23 : #include <linux/sched/signal.h>
24 : #include <linux/syscalls.h>
25 : #include <linux/fs.h>
26 : #include <linux/iomap.h>
27 : #include <linux/mm.h>
28 : #include <linux/percpu.h>
29 : #include <linux/slab.h>
30 : #include <linux/capability.h>
31 : #include <linux/blkdev.h>
32 : #include <linux/file.h>
33 : #include <linux/quotaops.h>
34 : #include <linux/highmem.h>
35 : #include <linux/export.h>
36 : #include <linux/backing-dev.h>
37 : #include <linux/writeback.h>
38 : #include <linux/hash.h>
39 : #include <linux/suspend.h>
40 : #include <linux/buffer_head.h>
41 : #include <linux/task_io_accounting_ops.h>
42 : #include <linux/bio.h>
43 : #include <linux/cpu.h>
44 : #include <linux/bitops.h>
45 : #include <linux/mpage.h>
46 : #include <linux/bit_spinlock.h>
47 : #include <linux/pagevec.h>
48 : #include <linux/sched/mm.h>
49 : #include <trace/events/block.h>
50 : #include <linux/fscrypt.h>
51 :
52 : #include "internal.h"
53 :
54 : static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
55 : static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
56 : struct writeback_control *wbc);
57 :
58 : #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
59 :
60 0 : inline void touch_buffer(struct buffer_head *bh)
61 : {
62 0 : trace_block_touch_buffer(bh);
63 0 : mark_page_accessed(bh->b_page);
64 0 : }
65 : EXPORT_SYMBOL(touch_buffer);
66 :
67 0 : void __lock_buffer(struct buffer_head *bh)
68 : {
69 0 : wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
70 0 : }
71 : EXPORT_SYMBOL(__lock_buffer);
72 :
73 0 : void unlock_buffer(struct buffer_head *bh)
74 : {
75 0 : clear_bit_unlock(BH_Lock, &bh->b_state);
76 0 : smp_mb__after_atomic();
77 0 : wake_up_bit(&bh->b_state, BH_Lock);
78 0 : }
79 : EXPORT_SYMBOL(unlock_buffer);
80 :
81 : /*
82 : * Returns whether the page has dirty or writeback buffers. If all the
83 : * buffers are unlocked and clean then the PageDirty information is stale.
84 : * If any of the buffers are locked, it is assumed they are locked for IO.
85 : */
86 0 : void buffer_check_dirty_writeback(struct page *page,
87 : bool *dirty, bool *writeback)
88 : {
89 : struct buffer_head *head, *bh;
90 0 : *dirty = false;
91 0 : *writeback = false;
92 :
93 0 : BUG_ON(!PageLocked(page));
94 :
95 0 : if (!page_has_buffers(page))
96 : return;
97 :
98 0 : if (PageWriteback(page))
99 0 : *writeback = true;
100 :
101 0 : head = page_buffers(page);
102 0 : bh = head;
103 : do {
104 0 : if (buffer_locked(bh))
105 0 : *writeback = true;
106 :
107 0 : if (buffer_dirty(bh))
108 0 : *dirty = true;
109 :
110 0 : bh = bh->b_this_page;
111 0 : } while (bh != head);
112 : }
113 : EXPORT_SYMBOL(buffer_check_dirty_writeback);
114 :
115 : /*
116 : * Block until a buffer comes unlocked. This doesn't stop it
117 : * from becoming locked again - you have to lock it yourself
118 : * if you want to preserve its state.
119 : */
120 0 : void __wait_on_buffer(struct buffer_head * bh)
121 : {
122 0 : wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
123 0 : }
124 : EXPORT_SYMBOL(__wait_on_buffer);
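
/*
 * A minimal illustrative sketch (not part of this file) of the usual
 * submit-then-wait pattern that wait_on_buffer() supports; the same shape
 * appears in __bread_slow() further down.  @bh is assumed to be a mapped
 * buffer we want to read synchronously:
 *
 *        lock_buffer(bh);
 *        if (buffer_uptodate(bh)) {
 *                unlock_buffer(bh);
 *        } else {
 *                get_bh(bh);
 *                bh->b_end_io = end_buffer_read_sync;
 *                submit_bh(REQ_OP_READ, 0, bh);
 *                wait_on_buffer(bh);
 *                if (!buffer_uptodate(bh))
 *                        return -EIO;
 *        }
 */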
125 :
126 0 : static void buffer_io_error(struct buffer_head *bh, char *msg)
127 : {
128 0 : if (!test_bit(BH_Quiet, &bh->b_state))
129 0 : printk_ratelimited(KERN_ERR
130 : "Buffer I/O error on dev %pg, logical block %llu%s\n",
131 : bh->b_bdev, (unsigned long long)bh->b_blocknr, msg);
132 0 : }
133 :
134 : /*
135 : * End-of-IO handler helper function which does not touch the bh after
136 : * unlocking it.
137 : * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
138 : * a race there is benign: unlock_buffer() only uses the bh's address for
139 : * hashing after unlocking the buffer, so it doesn't actually touch the bh
140 : * itself.
141 : */
142 0 : static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
143 : {
144 0 : if (uptodate) {
145 : set_buffer_uptodate(bh);
146 : } else {
147 : /* This happens, due to failed read-ahead attempts. */
148 : clear_buffer_uptodate(bh);
149 : }
150 0 : unlock_buffer(bh);
151 0 : }
152 :
153 : /*
154 : * Default synchronous end-of-IO handler. Just mark it up-to-date and
155 : * unlock the buffer. This is what ll_rw_block uses too.
156 : */
157 0 : void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
158 : {
159 0 : __end_buffer_read_notouch(bh, uptodate);
160 0 : put_bh(bh);
161 0 : }
162 : EXPORT_SYMBOL(end_buffer_read_sync);
163 :
164 0 : void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
165 : {
166 0 : if (uptodate) {
167 : set_buffer_uptodate(bh);
168 : } else {
169 0 : buffer_io_error(bh, ", lost sync page write");
170 0 : mark_buffer_write_io_error(bh);
171 : clear_buffer_uptodate(bh);
172 : }
173 0 : unlock_buffer(bh);
174 0 : put_bh(bh);
175 0 : }
176 : EXPORT_SYMBOL(end_buffer_write_sync);
177 :
178 : /*
179 : * Various filesystems appear to want __find_get_block to be non-blocking.
180 : * But it's the page lock which protects the buffers. To get around this,
181 : * we get exclusion from try_to_free_buffers with the blockdev mapping's
182 : * private_lock.
183 : *
184 : * Hack idea: for the blockdev mapping, private_lock contention
185 : * may be quite high. This code could TryLock the page, and if that
186 : * succeeds, there is no need to take private_lock.
187 : */
188 : static struct buffer_head *
189 0 : __find_get_block_slow(struct block_device *bdev, sector_t block)
190 : {
191 0 : struct inode *bd_inode = bdev->bd_inode;
192 0 : struct address_space *bd_mapping = bd_inode->i_mapping;
193 0 : struct buffer_head *ret = NULL;
194 : pgoff_t index;
195 : struct buffer_head *bh;
196 : struct buffer_head *head;
197 : struct page *page;
198 0 : int all_mapped = 1;
199 : static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1);
200 :
201 0 : index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
202 0 : page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED);
203 0 : if (!page)
204 : goto out;
205 :
206 0 : spin_lock(&bd_mapping->private_lock);
207 0 : if (!page_has_buffers(page))
208 : goto out_unlock;
209 0 : head = page_buffers(page);
210 0 : bh = head;
211 : do {
212 0 : if (!buffer_mapped(bh))
213 : all_mapped = 0;
214 0 : else if (bh->b_blocknr == block) {
215 0 : ret = bh;
216 : get_bh(bh);
217 : goto out_unlock;
218 : }
219 0 : bh = bh->b_this_page;
220 0 : } while (bh != head);
221 :
222 : /* we might be here because some of the buffers on this page are
223 : * not mapped. This is due to various races between
224 : * file I/O on the block device and getblk. It gets dealt with
225 : * elsewhere; don't report an error if we had some unmapped buffers.
226 : */
227 0 : ratelimit_set_flags(&last_warned, RATELIMIT_MSG_ON_RELEASE);
228 0 : if (all_mapped && __ratelimit(&last_warned)) {
229 0 : printk("__find_get_block_slow() failed. block=%llu, "
230 : "b_blocknr=%llu, b_state=0x%08lx, b_size=%zu, "
231 : "device %pg blocksize: %d\n",
232 : (unsigned long long)block,
233 : (unsigned long long)bh->b_blocknr,
234 : bh->b_state, bh->b_size, bdev,
235 : 1 << bd_inode->i_blkbits);
236 : }
237 : out_unlock:
238 0 : spin_unlock(&bd_mapping->private_lock);
239 0 : put_page(page);
240 : out:
241 0 : return ret;
242 : }
243 :
244 0 : static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
245 : {
246 : unsigned long flags;
247 : struct buffer_head *first;
248 : struct buffer_head *tmp;
249 : struct page *page;
250 0 : int page_uptodate = 1;
251 :
252 0 : BUG_ON(!buffer_async_read(bh));
253 :
254 0 : page = bh->b_page;
255 0 : if (uptodate) {
256 : set_buffer_uptodate(bh);
257 : } else {
258 0 : clear_buffer_uptodate(bh);
259 0 : buffer_io_error(bh, ", async page read");
260 : SetPageError(page);
261 : }
262 :
263 : /*
264 : * Be _very_ careful from here on. Bad things can happen if
265 : * two buffer heads end IO at almost the same time and both
266 : * decide that the page is now completely done.
267 : */
268 0 : first = page_buffers(page);
269 0 : spin_lock_irqsave(&first->b_uptodate_lock, flags);
270 0 : clear_buffer_async_read(bh);
271 0 : unlock_buffer(bh);
272 0 : tmp = bh;
273 : do {
274 0 : if (!buffer_uptodate(tmp))
275 0 : page_uptodate = 0;
276 0 : if (buffer_async_read(tmp)) {
277 0 : BUG_ON(!buffer_locked(tmp));
278 : goto still_busy;
279 : }
280 0 : tmp = tmp->b_this_page;
281 0 : } while (tmp != bh);
282 0 : spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
283 :
284 : /*
285 : * If none of the buffers had errors and they are all
286 : * uptodate then we can set the page uptodate.
287 : */
288 0 : if (page_uptodate && !PageError(page))
289 : SetPageUptodate(page);
290 0 : unlock_page(page);
291 0 : return;
292 :
293 : still_busy:
294 0 : spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
295 : return;
296 : }
297 :
298 : struct decrypt_bh_ctx {
299 : struct work_struct work;
300 : struct buffer_head *bh;
301 : };
302 :
303 : static void decrypt_bh(struct work_struct *work)
304 : {
305 : struct decrypt_bh_ctx *ctx =
306 : container_of(work, struct decrypt_bh_ctx, work);
307 : struct buffer_head *bh = ctx->bh;
308 : int err;
309 :
310 : err = fscrypt_decrypt_pagecache_blocks(bh->b_page, bh->b_size,
311 : bh_offset(bh));
312 : end_buffer_async_read(bh, err == 0);
313 : kfree(ctx);
314 : }
315 :
316 : /*
317 : * I/O completion handler for block_read_full_page() - pages
318 : * which come unlocked at the end of I/O.
319 : */
320 0 : static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate)
321 : {
322 : /* Decrypt if needed */
323 : if (uptodate &&
324 : fscrypt_inode_uses_fs_layer_crypto(bh->b_page->mapping->host)) {
325 : struct decrypt_bh_ctx *ctx = kmalloc(sizeof(*ctx), GFP_ATOMIC);
326 :
327 : if (ctx) {
328 : INIT_WORK(&ctx->work, decrypt_bh);
329 : ctx->bh = bh;
330 : fscrypt_enqueue_decrypt_work(&ctx->work);
331 : return;
332 : }
333 : uptodate = 0;
334 : }
335 0 : end_buffer_async_read(bh, uptodate);
336 : }
337 :
338 : /*
339 : * Completion handler for block_write_full_page() - pages which are unlocked
340 : * during I/O, and which have PageWriteback cleared upon I/O completion.
341 : */
342 0 : void end_buffer_async_write(struct buffer_head *bh, int uptodate)
343 : {
344 : unsigned long flags;
345 : struct buffer_head *first;
346 : struct buffer_head *tmp;
347 : struct page *page;
348 :
349 0 : BUG_ON(!buffer_async_write(bh));
350 :
351 0 : page = bh->b_page;
352 0 : if (uptodate) {
353 : set_buffer_uptodate(bh);
354 : } else {
355 0 : buffer_io_error(bh, ", lost async page write");
356 0 : mark_buffer_write_io_error(bh);
357 0 : clear_buffer_uptodate(bh);
358 : SetPageError(page);
359 : }
360 :
361 0 : first = page_buffers(page);
362 0 : spin_lock_irqsave(&first->b_uptodate_lock, flags);
363 :
364 0 : clear_buffer_async_write(bh);
365 0 : unlock_buffer(bh);
366 0 : tmp = bh->b_this_page;
367 0 : while (tmp != bh) {
368 0 : if (buffer_async_write(tmp)) {
369 0 : BUG_ON(!buffer_locked(tmp));
370 : goto still_busy;
371 : }
372 0 : tmp = tmp->b_this_page;
373 : }
374 0 : spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
375 0 : end_page_writeback(page);
376 0 : return;
377 :
378 : still_busy:
379 0 : spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
380 : return;
381 : }
382 : EXPORT_SYMBOL(end_buffer_async_write);
383 :
384 : /*
385 : * If a page's buffers are under async read-in (end_buffer_async_read
386 : * completion) then there is a possibility that another thread of
387 : * control could lock one of the buffers after it has completed
388 : * but while some of the other buffers have not completed. This
389 : * locked buffer would confuse end_buffer_async_read() into not unlocking
390 : * the page. So the absence of BH_Async_Read tells end_buffer_async_read()
391 : * that this buffer is not under async I/O.
392 : *
393 : * The page comes unlocked when it has no locked buffer_async buffers
394 : * left.
395 : *
396 : * PageLocked prevents anyone from starting new async I/O against any of
397 : * the buffers.
398 : *
399 : * PageWriteback is used to prevent simultaneous writeout of the same
400 : * page.
401 : *
402 : * PageLocked prevents anyone from starting writeback of a page which is
403 : * under read I/O (PageWriteback is only ever set against a locked page).
404 : */
405 : static void mark_buffer_async_read(struct buffer_head *bh)
406 : {
407 0 : bh->b_end_io = end_buffer_async_read_io;
408 0 : set_buffer_async_read(bh);
409 : }
410 :
411 : static void mark_buffer_async_write_endio(struct buffer_head *bh,
412 : bh_end_io_t *handler)
413 : {
414 0 : bh->b_end_io = handler;
415 0 : set_buffer_async_write(bh);
416 : }
417 :
418 0 : void mark_buffer_async_write(struct buffer_head *bh)
419 : {
420 0 : mark_buffer_async_write_endio(bh, end_buffer_async_write);
421 0 : }
422 : EXPORT_SYMBOL(mark_buffer_async_write);
423 :
424 :
425 : /*
426 : * fs/buffer.c contains helper functions for buffer-backed address space's
427 : * fsync functions. A common requirement for buffer-based filesystems is
428 : * that certain data from the backing blockdev needs to be written out for
429 : * a successful fsync(). For example, ext2 indirect blocks need to be
430 : * written back and waited upon before fsync() returns.
431 : *
432 : * The functions mark_buffer_dirty_inode(), sync_mapping_buffers(),
433 : * inode_has_buffers() and invalidate_inode_buffers() are provided for the
434 : * management of a list of dependent buffers at ->i_mapping->private_list.
435 : *
436 : * Locking is a little subtle: try_to_free_buffers() will remove buffers
437 : * from their controlling inode's queue when they are being freed. But
438 : * try_to_free_buffers() will be operating against the *blockdev* mapping
439 : * at the time, not against the S_ISREG file which depends on those buffers.
440 : * So the locking for private_list is via the private_lock in the address_space
441 : * which backs the buffers. Which is different from the address_space
442 : * against which the buffers are listed. So for a particular address_space,
443 : * mapping->private_lock does *not* protect mapping->private_list! In fact,
444 : * mapping->private_list will always be protected by the backing blockdev's
445 : * ->private_lock.
446 : *
447 : * Which introduces a requirement: all buffers on an address_space's
448 : * ->private_list must be from the same address_space: the blockdev's.
449 : *
450 : * address_spaces which do not place buffers at ->private_list via these
451 : * utility functions are free to use private_lock and private_list for
452 : * whatever they want. The only requirement is that list_empty(private_list)
453 : * be true at clear_inode() time.
454 : *
455 : * FIXME: clear_inode should not call invalidate_inode_buffers(). The
456 : * filesystems should do that. invalidate_inode_buffers() should just go
457 : * BUG_ON(!list_empty).
458 : *
459 : * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
460 : * take an address_space, not an inode. And it should be called
461 : * mark_buffer_dirty_fsync() to clearly define why those buffers are being
462 : * queued up.
463 : *
464 : * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
465 : * list if it is already on a list. Because if the buffer is on a list,
466 : * it *must* already be on the right one. If not, the filesystem is being
467 : * silly. This will save a ton of locking. But first we have to ensure
468 : * that buffers are taken *off* the old inode's list when they are freed
469 : * (presumably in truncate). That requires careful auditing of all
470 : * filesystems (do it inside bforget()). It could also be done by bringing
471 : * b_inode back.
472 : */
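
/*
 * A minimal illustrative sketch (assumed names, not part of this file) of how
 * a filesystem consumes these helpers.  While allocating metadata it queues
 * the buffer on the owning inode, and its ->fsync() later writes and waits on
 * that list:
 *
 *        In the block-allocation path:
 *                mark_buffer_dirty_inode(bh, inode);
 *
 *        In the filesystem's ->fsync() method:
 *                err = file_write_and_wait_range(file, start, end);
 *                err2 = sync_mapping_buffers(file->f_mapping);
 *                return err ? err : err2;
 */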
473 :
474 : /*
475 : * The buffer's backing address_space's private_lock must be held
476 : */
477 0 : static void __remove_assoc_queue(struct buffer_head *bh)
478 : {
479 0 : list_del_init(&bh->b_assoc_buffers);
480 0 : WARN_ON(!bh->b_assoc_map);
481 0 : bh->b_assoc_map = NULL;
482 0 : }
483 :
484 0 : int inode_has_buffers(struct inode *inode)
485 : {
486 0 : return !list_empty(&inode->i_data.private_list);
487 : }
488 :
489 : /*
490 : * osync is designed to support O_SYNC io. It waits synchronously for
491 : * all already-submitted IO to complete, but does not queue any new
492 : * writes to the disk.
493 : *
494 : * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
495 : * you dirty the buffers, and then use osync_buffers_list() to wait for
496 : * completion. Any other dirty buffers which are not yet queued for
497 : * write will not be flushed to disk by the osync.
498 : */
499 0 : static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
500 : {
501 : struct buffer_head *bh;
502 : struct list_head *p;
503 0 : int err = 0;
504 :
505 : spin_lock(lock);
506 : repeat:
507 0 : list_for_each_prev(p, list) {
508 0 : bh = BH_ENTRY(p);
509 0 : if (buffer_locked(bh)) {
510 0 : get_bh(bh);
511 0 : spin_unlock(lock);
512 0 : wait_on_buffer(bh);
513 0 : if (!buffer_uptodate(bh))
514 0 : err = -EIO;
515 0 : brelse(bh);
516 : spin_lock(lock);
517 : goto repeat;
518 : }
519 : }
520 0 : spin_unlock(lock);
521 0 : return err;
522 : }
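
/*
 * A hand-rolled illustrative sketch of the O_SYNC pattern described above
 * (osync_buffers_list() itself is static and is normally reached only via
 * fsync_buffers_list() below): start the write as the buffer is dirtied,
 * then wait for the already-submitted I/O without queueing anything new:
 *
 *        ll_rw_block(REQ_OP_WRITE, 0, 1, &bh);
 *        ...
 *        wait_on_buffer(bh);
 *        if (!buffer_uptodate(bh))
 *                err = -EIO;
 */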
523 :
524 0 : void emergency_thaw_bdev(struct super_block *sb)
525 : {
526 0 : while (sb->s_bdev && !thaw_bdev(sb->s_bdev))
527 0 : printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev);
528 0 : }
529 :
530 : /**
531 : * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
532 : * @mapping: the mapping which wants those buffers written
533 : *
534 : * Starts I/O against the buffers at mapping->private_list, and waits upon
535 : * that I/O.
536 : *
537 : * Basically, this is a convenience function for fsync().
538 : * @mapping is a file or directory which needs those buffers to be written for
539 : * a successful fsync().
540 : */
541 0 : int sync_mapping_buffers(struct address_space *mapping)
542 : {
543 0 : struct address_space *buffer_mapping = mapping->private_data;
544 :
545 0 : if (buffer_mapping == NULL || list_empty(&mapping->private_list))
546 : return 0;
547 :
548 0 : return fsync_buffers_list(&buffer_mapping->private_lock,
549 : &mapping->private_list);
550 : }
551 : EXPORT_SYMBOL(sync_mapping_buffers);
552 :
553 : /*
554 : * Called when we've recently written block `bblock', and it is known that
555 : * `bblock' was for a buffer_boundary() buffer. This means that the block at
556 : * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
557 : * dirty, schedule it for IO. So that indirects merge nicely with their data.
558 : */
559 0 : void write_boundary_block(struct block_device *bdev,
560 : sector_t bblock, unsigned blocksize)
561 : {
562 0 : struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
563 0 : if (bh) {
564 0 : if (buffer_dirty(bh))
565 0 : ll_rw_block(REQ_OP_WRITE, 0, 1, &bh);
566 0 : put_bh(bh);
567 : }
568 0 : }
569 :
570 0 : void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
571 : {
572 0 : struct address_space *mapping = inode->i_mapping;
573 0 : struct address_space *buffer_mapping = bh->b_page->mapping;
574 :
575 0 : mark_buffer_dirty(bh);
576 0 : if (!mapping->private_data) {
577 0 : mapping->private_data = buffer_mapping;
578 : } else {
579 0 : BUG_ON(mapping->private_data != buffer_mapping);
580 : }
581 0 : if (!bh->b_assoc_map) {
582 0 : spin_lock(&buffer_mapping->private_lock);
583 0 : list_move_tail(&bh->b_assoc_buffers,
584 : &mapping->private_list);
585 0 : bh->b_assoc_map = mapping;
586 0 : spin_unlock(&buffer_mapping->private_lock);
587 : }
588 0 : }
589 : EXPORT_SYMBOL(mark_buffer_dirty_inode);
590 :
591 : /*
592 : * Add a page to the dirty page list.
593 : *
594 : * It is a sad fact of life that this function is called from several places
595 : * deeply under spinlocking. It may not sleep.
596 : *
597 : * If the page has buffers, the uptodate buffers are set dirty, to preserve
598 : * dirty-state coherency between the page and the buffers. If the page does
599 : * not have buffers then when they are later attached they will all be set
600 : * dirty.
601 : *
602 : * The buffers are dirtied before the page is dirtied. There's a small race
603 : * window in which a writepage caller may see the page cleanness but not the
604 : * buffer dirtiness. That's fine. If this code were to set the page dirty
605 : * before the buffers, a concurrent writepage caller could clear the page dirty
606 : * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
607 : * page on the dirty page list.
608 : *
609 : * We use private_lock to lock against try_to_free_buffers while using the
610 : * page's buffer list. Also use this to protect against clean buffers being
611 : * added to the page after it was set dirty.
612 : *
613 : * FIXME: may need to call ->reservepage here as well. That's rather up to the
614 : * address_space though.
615 : */
616 0 : bool block_dirty_folio(struct address_space *mapping, struct folio *folio)
617 : {
618 : struct buffer_head *head;
619 : bool newly_dirty;
620 :
621 0 : spin_lock(&mapping->private_lock);
622 0 : head = folio_buffers(folio);
623 0 : if (head) {
624 : struct buffer_head *bh = head;
625 :
626 : do {
627 0 : set_buffer_dirty(bh);
628 0 : bh = bh->b_this_page;
629 0 : } while (bh != head);
630 : }
631 : /*
632 : * Lock out page's memcg migration to keep PageDirty
633 : * synchronized with per-memcg dirty page counters.
634 : */
635 0 : folio_memcg_lock(folio);
636 0 : newly_dirty = !folio_test_set_dirty(folio);
637 0 : spin_unlock(&mapping->private_lock);
638 :
639 0 : if (newly_dirty)
640 0 : __folio_mark_dirty(folio, mapping, 1);
641 :
642 0 : folio_memcg_unlock(folio);
643 :
644 0 : if (newly_dirty)
645 0 : __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
646 :
647 0 : return newly_dirty;
648 : }
649 : EXPORT_SYMBOL(block_dirty_folio);
650 :
651 : /*
652 : * Write out and wait upon a list of buffers.
653 : *
654 : * We have conflicting pressures: we want to make sure that all
655 : * initially dirty buffers get waited on, but that any subsequently
656 : * dirtied buffers don't. After all, we don't want fsync to last
657 : * forever if somebody is actively writing to the file.
658 : *
659 : * Do this in two main stages: first we copy dirty buffers to a
660 : * temporary inode list, queueing the writes as we go. Then we clean
661 : * up, waiting for those writes to complete.
662 : *
663 : * During this second stage, any subsequent updates to the file may end
664 : * up refiling the buffer on the original inode's dirty list again, so
665 : * there is a chance we will end up with a buffer queued for write but
666 : * not yet completed on that list. So, as a final cleanup we go through
667 : * the osync code to catch these locked, dirty buffers without requeuing
668 : * any newly dirty buffers for write.
669 : */
670 0 : static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
671 : {
672 : struct buffer_head *bh;
673 : struct list_head tmp;
674 : struct address_space *mapping;
675 0 : int err = 0, err2;
676 : struct blk_plug plug;
677 :
678 0 : INIT_LIST_HEAD(&tmp);
679 0 : blk_start_plug(&plug);
680 :
681 : spin_lock(lock);
682 0 : while (!list_empty(list)) {
683 0 : bh = BH_ENTRY(list->next);
684 0 : mapping = bh->b_assoc_map;
685 0 : __remove_assoc_queue(bh);
686 : /* Avoid race with mark_buffer_dirty_inode() which does
687 : * a lockless check and we rely on seeing the dirty bit */
688 0 : smp_mb();
689 0 : if (buffer_dirty(bh) || buffer_locked(bh)) {
690 0 : list_add(&bh->b_assoc_buffers, &tmp);
691 0 : bh->b_assoc_map = mapping;
692 0 : if (buffer_dirty(bh)) {
693 0 : get_bh(bh);
694 0 : spin_unlock(lock);
695 : /*
696 : * Ensure any pending I/O completes so that
697 : * write_dirty_buffer() actually writes the
698 : * current contents - it is a noop if I/O is
699 : * still in flight on potentially older
700 : * contents.
701 : */
702 0 : write_dirty_buffer(bh, REQ_SYNC);
703 :
704 : /*
705 : * Kick off IO for the previous mapping. Note
706 : * that we will not run the very last mapping,
707 : * wait_on_buffer() will do that for us
708 : * through sync_buffer().
709 : */
710 0 : brelse(bh);
711 : spin_lock(lock);
712 : }
713 : }
714 : }
715 :
716 0 : spin_unlock(lock);
717 0 : blk_finish_plug(&plug);
718 : spin_lock(lock);
719 :
720 0 : while (!list_empty(&tmp)) {
721 0 : bh = BH_ENTRY(tmp.prev);
722 0 : get_bh(bh);
723 0 : mapping = bh->b_assoc_map;
724 0 : __remove_assoc_queue(bh);
725 : /* Avoid race with mark_buffer_dirty_inode() which does
726 : * a lockless check and we rely on seeing the dirty bit */
727 0 : smp_mb();
728 0 : if (buffer_dirty(bh)) {
729 0 : list_add(&bh->b_assoc_buffers,
730 : &mapping->private_list);
731 0 : bh->b_assoc_map = mapping;
732 : }
733 0 : spin_unlock(lock);
734 0 : wait_on_buffer(bh);
735 0 : if (!buffer_uptodate(bh))
736 0 : err = -EIO;
737 0 : brelse(bh);
738 : spin_lock(lock);
739 : }
740 :
741 0 : spin_unlock(lock);
742 0 : err2 = osync_buffers_list(lock, list);
743 0 : if (err)
744 : return err;
745 : else
746 : return err2;
747 : }
748 :
749 : /*
750 : * Invalidate any and all dirty buffers on a given inode. We are
751 : * probably unmounting the fs, but that doesn't mean we have already
752 : * done a sync(). Just drop the buffers from the inode list.
753 : *
754 : * NOTE: we take the inode's blockdev's mapping's private_lock. Which
755 : * assumes that all the buffers are against the blockdev. Not true
756 : * for reiserfs.
757 : */
758 0 : void invalidate_inode_buffers(struct inode *inode)
759 : {
760 0 : if (inode_has_buffers(inode)) {
761 0 : struct address_space *mapping = &inode->i_data;
762 0 : struct list_head *list = &mapping->private_list;
763 0 : struct address_space *buffer_mapping = mapping->private_data;
764 :
765 0 : spin_lock(&buffer_mapping->private_lock);
766 0 : while (!list_empty(list))
767 0 : __remove_assoc_queue(BH_ENTRY(list->next));
768 0 : spin_unlock(&buffer_mapping->private_lock);
769 : }
770 0 : }
771 : EXPORT_SYMBOL(invalidate_inode_buffers);
772 :
773 : /*
774 : * Remove any clean buffers from the inode's buffer list. This is called
775 : * when we're trying to free the inode itself. Those buffers can pin it.
776 : *
777 : * Returns true if all buffers were removed.
778 : */
779 0 : int remove_inode_buffers(struct inode *inode)
780 : {
781 0 : int ret = 1;
782 :
783 0 : if (inode_has_buffers(inode)) {
784 0 : struct address_space *mapping = &inode->i_data;
785 0 : struct list_head *list = &mapping->private_list;
786 0 : struct address_space *buffer_mapping = mapping->private_data;
787 :
788 0 : spin_lock(&buffer_mapping->private_lock);
789 0 : while (!list_empty(list)) {
790 0 : struct buffer_head *bh = BH_ENTRY(list->next);
791 0 : if (buffer_dirty(bh)) {
792 : ret = 0;
793 : break;
794 : }
795 0 : __remove_assoc_queue(bh);
796 : }
797 0 : spin_unlock(&buffer_mapping->private_lock);
798 : }
799 0 : return ret;
800 : }
801 :
802 : /*
803 : * Create the appropriate buffers for the given data page and the size of
804 : * each buffer. Use the bh->b_this_page linked list to
805 : * follow the buffers created. Return NULL if unable to create more
806 : * buffers.
807 : *
808 : * The retry flag is used to differentiate async IO (paging, swapping),
809 : * which may not fail, from ordinary buffer allocations.
810 : */
811 0 : struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
812 : bool retry)
813 : {
814 : struct buffer_head *bh, *head;
815 0 : gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
816 : long offset;
817 : struct mem_cgroup *memcg, *old_memcg;
818 :
819 0 : if (retry)
820 0 : gfp |= __GFP_NOFAIL;
821 :
822 : /* The page lock pins the memcg */
823 0 : memcg = page_memcg(page);
824 0 : old_memcg = set_active_memcg(memcg);
825 :
826 0 : head = NULL;
827 0 : offset = PAGE_SIZE;
828 0 : while ((offset -= size) >= 0) {
829 0 : bh = alloc_buffer_head(gfp);
830 0 : if (!bh)
831 : goto no_grow;
832 :
833 0 : bh->b_this_page = head;
834 0 : bh->b_blocknr = -1;
835 0 : head = bh;
836 :
837 0 : bh->b_size = size;
838 :
839 : /* Link the buffer to its page */
840 0 : set_bh_page(bh, page, offset);
841 : }
842 : out:
843 : set_active_memcg(old_memcg);
844 0 : return head;
845 : /*
846 : * In case anything failed, we just free everything we got.
847 : */
848 : no_grow:
849 0 : if (head) {
850 : do {
851 0 : bh = head;
852 0 : head = head->b_this_page;
853 0 : free_buffer_head(bh);
854 0 : } while (head);
855 : }
856 :
857 : goto out;
858 : }
859 : EXPORT_SYMBOL_GPL(alloc_page_buffers);
860 :
861 : static inline void
862 : link_dev_buffers(struct page *page, struct buffer_head *head)
863 : {
864 : struct buffer_head *bh, *tail;
865 :
866 : bh = head;
867 : do {
868 0 : tail = bh;
869 0 : bh = bh->b_this_page;
870 0 : } while (bh);
871 0 : tail->b_this_page = head;
872 0 : attach_page_private(page, head);
873 : }
874 :
875 : static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
876 : {
877 0 : sector_t retval = ~((sector_t)0);
878 0 : loff_t sz = bdev_nr_bytes(bdev);
879 :
880 0 : if (sz) {
881 0 : unsigned int sizebits = blksize_bits(size);
882 0 : retval = (sz >> sizebits);
883 : }
884 : return retval;
885 : }
886 :
887 : /*
888 : * Initialise the state of a blockdev page's buffers.
889 : */
890 : static sector_t
891 0 : init_page_buffers(struct page *page, struct block_device *bdev,
892 : sector_t block, int size)
893 : {
894 0 : struct buffer_head *head = page_buffers(page);
895 0 : struct buffer_head *bh = head;
896 0 : int uptodate = PageUptodate(page);
897 0 : sector_t end_block = blkdev_max_block(bdev, size);
898 :
899 : do {
900 0 : if (!buffer_mapped(bh)) {
901 0 : bh->b_end_io = NULL;
902 0 : bh->b_private = NULL;
903 0 : bh->b_bdev = bdev;
904 0 : bh->b_blocknr = block;
905 0 : if (uptodate)
906 : set_buffer_uptodate(bh);
907 0 : if (block < end_block)
908 : set_buffer_mapped(bh);
909 : }
910 0 : block++;
911 0 : bh = bh->b_this_page;
912 0 : } while (bh != head);
913 :
914 : /*
915 : * Caller needs to validate requested block against end of device.
916 : */
917 0 : return end_block;
918 : }
919 :
920 : /*
921 : * Create the page-cache page that contains the requested block.
922 : *
923 : * This is used purely for blockdev mappings.
924 : */
925 : static int
926 0 : grow_dev_page(struct block_device *bdev, sector_t block,
927 : pgoff_t index, int size, int sizebits, gfp_t gfp)
928 : {
929 0 : struct inode *inode = bdev->bd_inode;
930 : struct page *page;
931 : struct buffer_head *bh;
932 : sector_t end_block;
933 0 : int ret = 0;
934 : gfp_t gfp_mask;
935 :
936 0 : gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp;
937 :
938 : /*
939 : * XXX: __getblk_slow() can not really deal with failure and
940 : * will endlessly loop on improvised global reclaim. Prefer
941 : * looping in the allocator rather than here, at least that
942 : * code knows what it's doing.
943 : */
944 0 : gfp_mask |= __GFP_NOFAIL;
945 :
946 0 : page = find_or_create_page(inode->i_mapping, index, gfp_mask);
947 :
948 0 : BUG_ON(!PageLocked(page));
949 :
950 0 : if (page_has_buffers(page)) {
951 0 : bh = page_buffers(page);
952 0 : if (bh->b_size == size) {
953 0 : end_block = init_page_buffers(page, bdev,
954 : (sector_t)index << sizebits,
955 : size);
956 0 : goto done;
957 : }
958 0 : if (!try_to_free_buffers(page))
959 : goto failed;
960 : }
961 :
962 : /*
963 : * Allocate some buffers for this page
964 : */
965 0 : bh = alloc_page_buffers(page, size, true);
966 :
967 : /*
968 : * Link the page to the buffers and initialise them. Take the
969 : * lock to be atomic wrt __find_get_block(), which does not
970 : * run under the page lock.
971 : */
972 0 : spin_lock(&inode->i_mapping->private_lock);
973 0 : link_dev_buffers(page, bh);
974 0 : end_block = init_page_buffers(page, bdev, (sector_t)index << sizebits,
975 : size);
976 0 : spin_unlock(&inode->i_mapping->private_lock);
977 : done:
978 0 : ret = (block < end_block) ? 1 : -ENXIO;
979 : failed:
980 0 : unlock_page(page);
981 0 : put_page(page);
982 0 : return ret;
983 : }
984 :
985 : /*
986 : * Create buffers for the page that holds the specified block device block.
987 : * If that page was dirty, the buffers are set dirty also.
988 : */
989 : static int
990 : grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
991 : {
992 : pgoff_t index;
993 : int sizebits;
994 :
995 0 : sizebits = PAGE_SHIFT - __ffs(size);
996 0 : index = block >> sizebits;
997 :
998 : /*
999 : * Check for a block which wants to lie outside our maximum possible
1000 : * pagecache index. (this comparison is done using sector_t types).
1001 : */
1002 : if (unlikely(index != block >> sizebits)) {
1003 : printk(KERN_ERR "%s: requested out-of-range block %llu for "
1004 : "device %pg\n",
1005 : __func__, (unsigned long long)block,
1006 : bdev);
1007 : return -EIO;
1008 : }
1009 :
1010 : /* Create a page with the proper size buffers.. */
1011 0 : return grow_dev_page(bdev, block, index, size, sizebits, gfp);
1012 : }
1013 :
1014 : static struct buffer_head *
1015 0 : __getblk_slow(struct block_device *bdev, sector_t block,
1016 : unsigned size, gfp_t gfp)
1017 : {
1018 : /* Size must be a multiple of the device's logical block size */
1019 0 : if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
1020 : (size < 512 || size > PAGE_SIZE))) {
1021 0 : printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1022 : size);
1023 0 : printk(KERN_ERR "logical block size: %d\n",
1024 : bdev_logical_block_size(bdev));
1025 :
1026 0 : dump_stack();
1027 0 : return NULL;
1028 : }
1029 :
1030 : for (;;) {
1031 : struct buffer_head *bh;
1032 : int ret;
1033 :
1034 0 : bh = __find_get_block(bdev, block, size);
1035 0 : if (bh)
1036 : return bh;
1037 :
1038 0 : ret = grow_buffers(bdev, block, size, gfp);
1039 0 : if (ret < 0)
1040 : return NULL;
1041 : }
1042 : }
1043 :
1044 : /*
1045 : * The relationship between dirty buffers and dirty pages:
1046 : *
1047 : * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1048 : * the page is tagged dirty in the page cache.
1049 : *
1050 : * At all times, the dirtiness of the buffers represents the dirtiness of
1051 : * subsections of the page. If the page has buffers, the page dirty bit is
1052 : * merely a hint about the true dirty state.
1053 : *
1054 : * When a page is set dirty in its entirety, all its buffers are marked dirty
1055 : * (if the page has buffers).
1056 : *
1057 : * When a buffer is marked dirty, its page is dirtied, but the page's other
1058 : * buffers are not.
1059 : *
1060 : * Also, when blockdev buffers are explicitly read with bread(), they
1061 : * individually become uptodate. But their backing page remains not
1062 : * uptodate - even if all of its buffers are uptodate. A subsequent
1063 : * block_read_full_page() against that page will discover all the uptodate
1064 : * buffers, will set the page uptodate and will perform no I/O.
1065 : */
1066 :
1067 : /**
1068 : * mark_buffer_dirty - mark a buffer_head as needing writeout
1069 : * @bh: the buffer_head to mark dirty
1070 : *
1071 : * mark_buffer_dirty() will set the dirty bit against the buffer, then set
1072 : * its backing page dirty, then tag the page as dirty in the page cache
1073 : * and then attach the address_space's inode to its superblock's dirty
1074 : * inode list.
1075 : *
1076 : * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
1077 : * i_pages lock and mapping->host->i_lock.
1078 : */
1079 0 : void mark_buffer_dirty(struct buffer_head *bh)
1080 : {
1081 0 : WARN_ON_ONCE(!buffer_uptodate(bh));
1082 :
1083 0 : trace_block_dirty_buffer(bh);
1084 :
1085 : /*
1086 : * Very *carefully* optimize the it-is-already-dirty case.
1087 : *
1088 : * Don't let the final "is it dirty" escape to before we
1089 : * perhaps modified the buffer.
1090 : */
1091 0 : if (buffer_dirty(bh)) {
1092 0 : smp_mb();
1093 0 : if (buffer_dirty(bh))
1094 : return;
1095 : }
1096 :
1097 0 : if (!test_set_buffer_dirty(bh)) {
1098 0 : struct page *page = bh->b_page;
1099 0 : struct address_space *mapping = NULL;
1100 :
1101 0 : lock_page_memcg(page);
1102 0 : if (!TestSetPageDirty(page)) {
1103 0 : mapping = page_mapping(page);
1104 0 : if (mapping)
1105 : __set_page_dirty(page, mapping, 0);
1106 : }
1107 0 : unlock_page_memcg(page);
1108 0 : if (mapping)
1109 0 : __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
1110 : }
1111 : }
1112 : EXPORT_SYMBOL(mark_buffer_dirty);
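
/*
 * A minimal illustrative sketch (sb, block, offset, src and len are assumed)
 * of the read-modify-dirty cycle built on mark_buffer_dirty(); sb_bread() and
 * brelse() come from buffer_head.h:
 *
 *        struct buffer_head *bh = sb_bread(sb, block);
 *        if (!bh)
 *                return -EIO;
 *        lock_buffer(bh);
 *        memcpy(bh->b_data + offset, src, len);
 *        mark_buffer_dirty(bh);
 *        unlock_buffer(bh);
 *        brelse(bh);
 */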
1113 :
1114 0 : void mark_buffer_write_io_error(struct buffer_head *bh)
1115 : {
1116 : struct super_block *sb;
1117 :
1118 0 : set_buffer_write_io_error(bh);
1119 : /* FIXME: do we need to set this in both places? */
1120 0 : if (bh->b_page && bh->b_page->mapping)
1121 0 : mapping_set_error(bh->b_page->mapping, -EIO);
1122 0 : if (bh->b_assoc_map)
1123 0 : mapping_set_error(bh->b_assoc_map, -EIO);
1124 : rcu_read_lock();
1125 0 : sb = READ_ONCE(bh->b_bdev->bd_super);
1126 0 : if (sb)
1127 0 : errseq_set(&sb->s_wb_err, -EIO);
1128 : rcu_read_unlock();
1129 0 : }
1130 : EXPORT_SYMBOL(mark_buffer_write_io_error);
1131 :
1132 : /*
1133 : * Decrement a buffer_head's reference count. If all buffers against a page
1134 : * have zero reference count, are clean and unlocked, and if the page is clean
1135 : * and unlocked then try_to_free_buffers() may strip the buffers from the page
1136 : * in preparation for freeing it (sometimes, rarely, buffers are removed from
1137 : * a page but it ends up not being freed, and buffers may later be reattached).
1138 : */
1139 0 : void __brelse(struct buffer_head * buf)
1140 : {
1141 0 : if (atomic_read(&buf->b_count)) {
1142 : put_bh(buf);
1143 : return;
1144 : }
1145 0 : WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1146 : }
1147 : EXPORT_SYMBOL(__brelse);
1148 :
1149 : /*
1150 : * bforget() is like brelse(), except it discards any
1151 : * potentially dirty data.
1152 : */
1153 0 : void __bforget(struct buffer_head *bh)
1154 : {
1155 0 : clear_buffer_dirty(bh);
1156 0 : if (bh->b_assoc_map) {
1157 0 : struct address_space *buffer_mapping = bh->b_page->mapping;
1158 :
1159 0 : spin_lock(&buffer_mapping->private_lock);
1160 0 : list_del_init(&bh->b_assoc_buffers);
1161 0 : bh->b_assoc_map = NULL;
1162 0 : spin_unlock(&buffer_mapping->private_lock);
1163 : }
1164 0 : __brelse(bh);
1165 0 : }
1166 : EXPORT_SYMBOL(__bforget);
1167 :
1168 0 : static struct buffer_head *__bread_slow(struct buffer_head *bh)
1169 : {
1170 0 : lock_buffer(bh);
1171 0 : if (buffer_uptodate(bh)) {
1172 0 : unlock_buffer(bh);
1173 0 : return bh;
1174 : } else {
1175 0 : get_bh(bh);
1176 0 : bh->b_end_io = end_buffer_read_sync;
1177 0 : submit_bh(REQ_OP_READ, 0, bh);
1178 0 : wait_on_buffer(bh);
1179 0 : if (buffer_uptodate(bh))
1180 : return bh;
1181 : }
1182 0 : brelse(bh);
1183 0 : return NULL;
1184 : }
1185 :
1186 : /*
1187 : * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
1188 : * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
1189 : * refcount elevated by one when they're in an LRU. A buffer can only appear
1190 : * once in a particular CPU's LRU. A single buffer can be present in multiple
1191 : * CPU's LRUs at the same time.
1192 : *
1193 : * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1194 : * sb_find_get_block().
1195 : *
1196 : * The LRUs themselves only need locking against invalidate_bh_lrus. We use
1197 : * a local interrupt disable for that.
1198 : */
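
/*
 * The cache is invisible to callers: they keep using the normal lookup
 * helpers and only benefit from the LRU implicitly.  A minimal sketch
 * (sb and blocknr assumed valid):
 *
 *        struct buffer_head *bh = sb_find_get_block(sb, blocknr);
 *        if (bh) {
 *                ...
 *                brelse(bh);
 *        }
 */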
1199 :
1200 : #define BH_LRU_SIZE 16
1201 :
1202 : struct bh_lru {
1203 : struct buffer_head *bhs[BH_LRU_SIZE];
1204 : };
1205 :
1206 : static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1207 :
1208 : #ifdef CONFIG_SMP
1209 : #define bh_lru_lock() local_irq_disable()
1210 : #define bh_lru_unlock() local_irq_enable()
1211 : #else
1212 : #define bh_lru_lock() preempt_disable()
1213 : #define bh_lru_unlock() preempt_enable()
1214 : #endif
1215 :
1216 0 : static inline void check_irqs_on(void)
1217 : {
1218 : #ifdef irqs_disabled
1219 0 : BUG_ON(irqs_disabled());
1220 : #endif
1221 0 : }
1222 :
1223 : /*
1224 : * Install a buffer_head into this cpu's LRU. If not already in the LRU, it is
1225 : * inserted at the front, and the buffer_head at the back if any is evicted.
1226 : * Or, if already in the LRU it is moved to the front.
1227 : */
1228 0 : static void bh_lru_install(struct buffer_head *bh)
1229 : {
1230 0 : struct buffer_head *evictee = bh;
1231 : struct bh_lru *b;
1232 : int i;
1233 :
1234 0 : check_irqs_on();
1235 0 : bh_lru_lock();
1236 :
1237 : /*
1238 : * The refcount of a buffer_head in the bh_lru prevents the attached
1239 : * page from being dropped (i.e., by try_to_free_buffers()), which
1240 : * could make page migration fail.
1241 : * Skip putting the upcoming bh into the bh_lru until migration is done.
1242 : */
1243 0 : if (lru_cache_disabled()) {
1244 0 : bh_lru_unlock();
1245 0 : return;
1246 : }
1247 :
1248 : b = this_cpu_ptr(&bh_lrus);
1249 0 : for (i = 0; i < BH_LRU_SIZE; i++) {
1250 0 : swap(evictee, b->bhs[i]);
1251 0 : if (evictee == bh) {
1252 0 : bh_lru_unlock();
1253 0 : return;
1254 : }
1255 : }
1256 :
1257 0 : get_bh(bh);
1258 0 : bh_lru_unlock();
1259 0 : brelse(evictee);
1260 : }
1261 :
1262 : /*
1263 : * Look up the bh in this cpu's LRU. If it's there, move it to the head.
1264 : */
1265 : static struct buffer_head *
1266 0 : lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1267 : {
1268 0 : struct buffer_head *ret = NULL;
1269 : unsigned int i;
1270 :
1271 0 : check_irqs_on();
1272 0 : bh_lru_lock();
1273 0 : for (i = 0; i < BH_LRU_SIZE; i++) {
1274 0 : struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
1275 :
1276 0 : if (bh && bh->b_blocknr == block && bh->b_bdev == bdev &&
1277 0 : bh->b_size == size) {
1278 0 : if (i) {
1279 0 : while (i) {
1280 0 : __this_cpu_write(bh_lrus.bhs[i],
1281 : __this_cpu_read(bh_lrus.bhs[i - 1]));
1282 0 : i--;
1283 : }
1284 0 : __this_cpu_write(bh_lrus.bhs[0], bh);
1285 : }
1286 0 : get_bh(bh);
1287 0 : ret = bh;
1288 0 : break;
1289 : }
1290 : }
1291 0 : bh_lru_unlock();
1292 0 : return ret;
1293 : }
1294 :
1295 : /*
1296 : * Perform a pagecache lookup for the matching buffer. If it's there, refresh
1297 : * it in the LRU and mark it as accessed. If it is not present then return
1298 : * NULL
1299 : */
1300 : struct buffer_head *
1301 0 : __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1302 : {
1303 0 : struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1304 :
1305 0 : if (bh == NULL) {
1306 : /* __find_get_block_slow will mark the page accessed */
1307 0 : bh = __find_get_block_slow(bdev, block);
1308 0 : if (bh)
1309 0 : bh_lru_install(bh);
1310 : } else
1311 : touch_buffer(bh);
1312 :
1313 0 : return bh;
1314 : }
1315 : EXPORT_SYMBOL(__find_get_block);
1316 :
1317 : /*
1318 : * __getblk_gfp() will locate (and, if necessary, create) the buffer_head
1319 : * which corresponds to the passed block_device, block and size. The
1320 : * returned buffer has its reference count incremented.
1321 : *
1322 : * __getblk_gfp() will lock up the machine if grow_dev_page's
1323 : * try_to_free_buffers() attempt is failing. FIXME, perhaps?
1324 : */
1325 : struct buffer_head *
1326 0 : __getblk_gfp(struct block_device *bdev, sector_t block,
1327 : unsigned size, gfp_t gfp)
1328 : {
1329 0 : struct buffer_head *bh = __find_get_block(bdev, block, size);
1330 :
1331 : might_sleep();
1332 0 : if (bh == NULL)
1333 0 : bh = __getblk_slow(bdev, block, size, gfp);
1334 0 : return bh;
1335 : }
1336 : EXPORT_SYMBOL(__getblk_gfp);
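
/*
 * Illustrative sketch (bdev, block and blocksize assumed): __getblk(), the
 * __GFP_MOVABLE wrapper from buffer_head.h, performs no I/O, so it suits a
 * block that is about to be overwritten in full; the caller takes
 * responsibility for the contents:
 *
 *        struct buffer_head *bh = __getblk(bdev, block, blocksize);
 *        if (!bh)
 *                return -ENOMEM;
 *        lock_buffer(bh);
 *        memset(bh->b_data, 0, bh->b_size);
 *        set_buffer_uptodate(bh);
 *        unlock_buffer(bh);
 *        mark_buffer_dirty(bh);
 *        brelse(bh);
 */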
1337 :
1338 : /*
1339 : * Do async read-ahead on a buffer.
1340 : */
1341 0 : void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1342 : {
1343 0 : struct buffer_head *bh = __getblk(bdev, block, size);
1344 0 : if (likely(bh)) {
1345 0 : ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, &bh);
1346 0 : brelse(bh);
1347 : }
1348 0 : }
1349 : EXPORT_SYMBOL(__breadahead);
1350 :
1351 0 : void __breadahead_gfp(struct block_device *bdev, sector_t block, unsigned size,
1352 : gfp_t gfp)
1353 : {
1354 0 : struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
1355 0 : if (likely(bh)) {
1356 0 : ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, &bh);
1357 0 : brelse(bh);
1358 : }
1359 0 : }
1360 : EXPORT_SYMBOL(__breadahead_gfp);
1361 :
1362 : /**
1363 : * __bread_gfp() - reads a specified block and returns the bh
1364 : * @bdev: the block_device to read from
1365 : * @block: number of block
1366 : * @size: size (in bytes) to read
1367 : * @gfp: page allocation flag
1368 : *
1369 : * Reads a specified block, and returns the buffer head that contains it.
1370 : * If you set @gfp to zero, the page cache page is allocated from the
1371 : * non-movable area so that it does not get in the way of page migration.
1372 : * It returns NULL if the block was unreadable.
1373 : */
1374 : struct buffer_head *
1375 0 : __bread_gfp(struct block_device *bdev, sector_t block,
1376 : unsigned size, gfp_t gfp)
1377 : {
1378 0 : struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
1379 :
1380 0 : if (likely(bh) && !buffer_uptodate(bh))
1381 0 : bh = __bread_slow(bh);
1382 0 : return bh;
1383 : }
1384 : EXPORT_SYMBOL(__bread_gfp);
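
/*
 * Illustrative sketch of the common read path through the __bread() wrapper
 * in buffer_head.h (bdev, block and blocksize assumed valid):
 *
 *        struct buffer_head *bh = __bread(bdev, block, blocksize);
 *        if (!bh)
 *                return -EIO;
 *        ... use bh->b_data ...
 *        brelse(bh);
 */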
1385 :
1386 : static void __invalidate_bh_lrus(struct bh_lru *b)
1387 : {
1388 : int i;
1389 :
1390 0 : for (i = 0; i < BH_LRU_SIZE; i++) {
1391 0 : brelse(b->bhs[i]);
1392 0 : b->bhs[i] = NULL;
1393 : }
1394 : }
1395 : /*
1396 : * invalidate_bh_lrus() is called rarely - but not only at unmount.
1397 : * This doesn't race because it runs in each cpu either in irq
1398 : * or with preempt disabled.
1399 : */
1400 0 : static void invalidate_bh_lru(void *arg)
1401 : {
1402 0 : struct bh_lru *b = &get_cpu_var(bh_lrus);
1403 :
1404 0 : __invalidate_bh_lrus(b);
1405 0 : put_cpu_var(bh_lrus);
1406 0 : }
1407 :
1408 0 : bool has_bh_in_lru(int cpu, void *dummy)
1409 : {
1410 0 : struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
1411 : int i;
1412 :
1413 0 : for (i = 0; i < BH_LRU_SIZE; i++) {
1414 0 : if (b->bhs[i])
1415 : return true;
1416 : }
1417 :
1418 : return false;
1419 : }
1420 :
1421 0 : void invalidate_bh_lrus(void)
1422 : {
1423 0 : on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);
1424 0 : }
1425 : EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1426 :
1427 : /*
1428 : * It's called from workqueue context so we need a bh_lru_lock to close
1429 : * the race with preemption/irq.
1430 : */
1431 0 : void invalidate_bh_lrus_cpu(void)
1432 : {
1433 : struct bh_lru *b;
1434 :
1435 0 : bh_lru_lock();
1436 0 : b = this_cpu_ptr(&bh_lrus);
1437 0 : __invalidate_bh_lrus(b);
1438 0 : bh_lru_unlock();
1439 0 : }
1440 :
1441 0 : void set_bh_page(struct buffer_head *bh,
1442 : struct page *page, unsigned long offset)
1443 : {
1444 0 : bh->b_page = page;
1445 0 : BUG_ON(offset >= PAGE_SIZE);
1446 0 : if (PageHighMem(page))
1447 : /*
1448 : * This catches illegal uses and preserves the offset:
1449 : */
1450 : bh->b_data = (char *)(0 + offset);
1451 : else
1452 0 : bh->b_data = page_address(page) + offset;
1453 0 : }
1454 : EXPORT_SYMBOL(set_bh_page);
1455 :
1456 : /*
1457 : * Called when truncating a buffer on a page completely.
1458 : */
1459 :
1460 : /* Bits that are cleared during an invalidate */
1461 : #define BUFFER_FLAGS_DISCARD \
1462 : (1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \
1463 : 1 << BH_Delay | 1 << BH_Unwritten)
1464 :
1465 0 : static void discard_buffer(struct buffer_head * bh)
1466 : {
1467 : unsigned long b_state, b_state_old;
1468 :
1469 0 : lock_buffer(bh);
1470 0 : clear_buffer_dirty(bh);
1471 0 : bh->b_bdev = NULL;
1472 0 : b_state = bh->b_state;
1473 : for (;;) {
1474 0 : b_state_old = cmpxchg(&bh->b_state, b_state,
1475 : (b_state & ~BUFFER_FLAGS_DISCARD));
1476 0 : if (b_state_old == b_state)
1477 : break;
1478 : b_state = b_state_old;
1479 : }
1480 0 : unlock_buffer(bh);
1481 0 : }
1482 :
1483 : /**
1484 : * block_invalidate_folio - Invalidate part or all of a buffer-backed folio.
1485 : * @folio: The folio which is affected.
1486 : * @offset: start of the range to invalidate
1487 : * @length: length of the range to invalidate
1488 : *
1489 : * block_invalidate_folio() is called when all or part of the folio has been
1490 : * invalidated by a truncate operation.
1491 : *
1492 : * block_invalidate_folio() does not have to release all buffers, but it must
1493 : * ensure that no dirty buffer is left outside @offset and that no I/O
1494 : * is underway against any of the blocks which are outside the truncation
1495 : * point. Because the caller is about to free (and possibly reuse) those
1496 : * blocks on-disk.
1497 : */
1498 0 : void block_invalidate_folio(struct folio *folio, size_t offset, size_t length)
1499 : {
1500 : struct buffer_head *head, *bh, *next;
1501 0 : size_t curr_off = 0;
1502 0 : size_t stop = length + offset;
1503 :
1504 0 : BUG_ON(!folio_test_locked(folio));
1505 :
1506 : /*
1507 : * Check for overflow
1508 : */
1509 0 : BUG_ON(stop > folio_size(folio) || stop < length);
1510 :
1511 0 : head = folio_buffers(folio);
1512 0 : if (!head)
1513 : return;
1514 :
1515 : bh = head;
1516 : do {
1517 0 : size_t next_off = curr_off + bh->b_size;
1518 0 : next = bh->b_this_page;
1519 :
1520 : /*
1521 : * Are we still fully in range ?
1522 : */
1523 0 : if (next_off > stop)
1524 : goto out;
1525 :
1526 : /*
1527 : * is this block fully invalidated?
1528 : */
1529 0 : if (offset <= curr_off)
1530 0 : discard_buffer(bh);
1531 0 : curr_off = next_off;
1532 0 : bh = next;
1533 0 : } while (bh != head);
1534 :
1535 : /*
1536 : * We release buffers only if the entire folio is being invalidated.
1537 : * The get_block cached value has been unconditionally invalidated,
1538 : * so real IO is not possible anymore.
1539 : */
1540 0 : if (length == folio_size(folio))
1541 0 : filemap_release_folio(folio, 0);
1542 : out:
1543 : return;
1544 : }
1545 : EXPORT_SYMBOL(block_invalidate_folio);
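
/*
 * Illustrative sketch (example_aops is a made-up name): buffer-backed
 * filesystems typically plug this helper, together with block_dirty_folio()
 * above, straight into their address_space_operations:
 *
 *        static const struct address_space_operations example_aops = {
 *                .dirty_folio      = block_dirty_folio,
 *                .invalidate_folio = block_invalidate_folio,
 *                ...
 *        };
 */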
1546 :
1547 :
1548 : /*
1549 : * We attach and possibly dirty the buffers atomically wrt
1550 : * block_dirty_folio() via private_lock. try_to_free_buffers
1551 : * is already excluded via the page lock.
1552 : */
1553 0 : void create_empty_buffers(struct page *page,
1554 : unsigned long blocksize, unsigned long b_state)
1555 : {
1556 : struct buffer_head *bh, *head, *tail;
1557 :
1558 0 : head = alloc_page_buffers(page, blocksize, true);
1559 0 : bh = head;
1560 : do {
1561 0 : bh->b_state |= b_state;
1562 0 : tail = bh;
1563 0 : bh = bh->b_this_page;
1564 0 : } while (bh);
1565 0 : tail->b_this_page = head;
1566 :
1567 0 : spin_lock(&page->mapping->private_lock);
1568 0 : if (PageUptodate(page) || PageDirty(page)) {
1569 : bh = head;
1570 : do {
1571 0 : if (PageDirty(page))
1572 : set_buffer_dirty(bh);
1573 0 : if (PageUptodate(page))
1574 : set_buffer_uptodate(bh);
1575 0 : bh = bh->b_this_page;
1576 0 : } while (bh != head);
1577 : }
1578 0 : attach_page_private(page, head);
1579 0 : spin_unlock(&page->mapping->private_lock);
1580 0 : }
1581 : EXPORT_SYMBOL(create_empty_buffers);
1582 :
1583 : /**
1584 : * clean_bdev_aliases: clean a range of buffers in block device
1585 : * @bdev: Block device to clean buffers in
1586 : * @block: Start of a range of blocks to clean
1587 : * @len: Number of blocks to clean
1588 : *
1589 : * We are taking a range of blocks for data and we don't want writeback of any
1590 : * buffer-cache aliases from the moment this function returns until the moment
1591 : * something explicitly marks the buffer dirty again (hopefully that will not
1592 : * happen until we free that block ;-) We don't even need to mark
1593 : * it not-uptodate - nobody can expect anything from a newly allocated buffer
1594 : * anyway. We used to use unmap_buffer() for such invalidation, but that was
1595 : * wrong. We definitely don't want to mark the alias unmapped, for example - it
1596 : * would confuse anyone who might pick it with bread() afterwards...
1597 : *
1598 : * Also, note that bforget() doesn't lock the buffer. So there can be
1599 : * writeout I/O going on against recently-freed buffers. We don't wait on that
1600 : * I/O in bforget() - it's more efficient to wait on the I/O only if we really
1601 : * need to. That happens here.
1602 : */
1603 0 : void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
1604 : {
1605 0 : struct inode *bd_inode = bdev->bd_inode;
1606 0 : struct address_space *bd_mapping = bd_inode->i_mapping;
1607 : struct pagevec pvec;
1608 0 : pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
1609 : pgoff_t end;
1610 : int i, count;
1611 : struct buffer_head *bh;
1612 : struct buffer_head *head;
1613 :
1614 0 : end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits);
1615 0 : pagevec_init(&pvec);
1616 0 : while (pagevec_lookup_range(&pvec, bd_mapping, &index, end)) {
1617 0 : count = pagevec_count(&pvec);
1618 0 : for (i = 0; i < count; i++) {
1619 0 : struct page *page = pvec.pages[i];
1620 :
1621 0 : if (!page_has_buffers(page))
1622 0 : continue;
1623 : /*
1624 : * We use page lock instead of bd_mapping->private_lock
1625 : * to pin buffers here since we can afford to sleep and
1626 : * it scales better than a global spinlock.
1627 : */
1628 0 : lock_page(page);
1629 : /* Recheck when the page is locked which pins bhs */
1630 0 : if (!page_has_buffers(page))
1631 : goto unlock_page;
1632 0 : head = page_buffers(page);
1633 0 : bh = head;
1634 : do {
1635 0 : if (!buffer_mapped(bh) || (bh->b_blocknr < block))
1636 : goto next;
1637 0 : if (bh->b_blocknr >= block + len)
1638 : break;
1639 0 : clear_buffer_dirty(bh);
1640 0 : wait_on_buffer(bh);
1641 : clear_buffer_req(bh);
1642 : next:
1643 0 : bh = bh->b_this_page;
1644 0 : } while (bh != head);
1645 : unlock_page:
1646 0 : unlock_page(page);
1647 : }
1648 0 : pagevec_release(&pvec);
1649 0 : cond_resched();
1650 : /* End of range already reached? */
1651 0 : if (index > end || !index)
1652 : break;
1653 : }
1654 0 : }
1655 : EXPORT_SYMBOL(clean_bdev_aliases);
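
/*
 * Illustrative sketch: the common single-block case goes through the
 * clean_bdev_bh_alias() wrapper from buffer_head.h, typically right after
 * get_block() has reported a freshly allocated block (the same pattern
 * appears in __block_write_full_page() below):
 *
 *        if (buffer_new(bh)) {
 *                clear_buffer_new(bh);
 *                clean_bdev_bh_alias(bh);
 *        }
 */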
1656 :
1657 : /*
1658 : * Size is a power-of-two in the range 512..PAGE_SIZE,
1659 : * and the case we care about most is PAGE_SIZE.
1660 : *
1661 : * So this *could* possibly be written with those
1662 : * constraints in mind (relevant mostly if some
1663 : * architecture has a slow bit-scan instruction)
1664 : */
1665 : static inline int block_size_bits(unsigned int blocksize)
1666 : {
1667 0 : return ilog2(blocksize);
1668 : }
1669 :
1670 0 : static struct buffer_head *create_page_buffers(struct page *page, struct inode *inode, unsigned int b_state)
1671 : {
1672 0 : BUG_ON(!PageLocked(page));
1673 :
1674 0 : if (!page_has_buffers(page))
1675 0 : create_empty_buffers(page, 1 << READ_ONCE(inode->i_blkbits),
1676 : b_state);
1677 0 : return page_buffers(page);
1678 : }
1679 :
1680 : /*
1681 : * NOTE! All mapped/uptodate combinations are valid:
1682 : *
1683 : * Mapped   Uptodate   Meaning
1684 : *
1685 : * No       No         "unknown" - must do get_block()
1686 : * No       Yes        "hole" - zero-filled
1687 : * Yes      No         "allocated" - allocated on disk, not read in
1688 : * Yes      Yes        "valid" - allocated and up-to-date in memory.
1689 : *
1690 : * "Dirty" is valid only with the last case (mapped+uptodate).
1691 : */
1692 :
1693 : /*
1694 : * While block_write_full_page is writing back the dirty buffers under
1695 : * the page lock, whoever dirtied the buffers may decide to clean them
1696 : * again at any time. We handle that by only looking at the buffer
1697 : * state inside lock_buffer().
1698 : *
1699 : * If block_write_full_page() is called for regular writeback
1700 : * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1701 : * locked buffer. This only can happen if someone has written the buffer
1702 : * directly, with submit_bh(). At the address_space level PageWriteback
1703 : * prevents this contention from occurring.
1704 : *
1705 : * If block_write_full_page() is called with wbc->sync_mode ==
1706 : * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this
1707 : * causes the writes to be flagged as synchronous writes.
1708 : */
1709 0 : int __block_write_full_page(struct inode *inode, struct page *page,
1710 : get_block_t *get_block, struct writeback_control *wbc,
1711 : bh_end_io_t *handler)
1712 : {
1713 : int err;
1714 : sector_t block;
1715 : sector_t last_block;
1716 : struct buffer_head *bh, *head;
1717 : unsigned int blocksize, bbits;
1718 0 : int nr_underway = 0;
1719 0 : int write_flags = wbc_to_write_flags(wbc);
1720 :
1721 0 : head = create_page_buffers(page, inode,
1722 : (1 << BH_Dirty)|(1 << BH_Uptodate));
1723 :
1724 : /*
1725 : * Be very careful. We have no exclusion from block_dirty_folio
1726 : * here, and the (potentially unmapped) buffers may become dirty at
1727 : * any time. If a buffer becomes dirty here after we've inspected it
1728 : * then we just miss that fact, and the page stays dirty.
1729 : *
1730 : * Buffers outside i_size may be dirtied by block_dirty_folio;
1731 : * handle that here by just cleaning them.
1732 : */
1733 :
1734 0 : bh = head;
1735 0 : blocksize = bh->b_size;
1736 0 : bbits = block_size_bits(blocksize);
1737 :
1738 0 : block = (sector_t)page->index << (PAGE_SHIFT - bbits);
1739 0 : last_block = (i_size_read(inode) - 1) >> bbits;
1740 :
1741 : /*
1742 : * Get all the dirty buffers mapped to disk addresses and
1743 : * handle any aliases from the underlying blockdev's mapping.
1744 : */
1745 : do {
1746 0 : if (block > last_block) {
1747 : /*
1748 : * mapped buffers outside i_size will occur, because
1749 : * this page can be outside i_size when there is a
1750 : * truncate in progress.
1751 : */
1752 : /*
1753 : * The buffer was zeroed by block_write_full_page()
1754 : */
1755 0 : clear_buffer_dirty(bh);
1756 : set_buffer_uptodate(bh);
1757 0 : } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
1758 0 : buffer_dirty(bh)) {
1759 0 : WARN_ON(bh->b_size != blocksize);
1760 0 : err = get_block(inode, block, bh, 1);
1761 0 : if (err)
1762 : goto recover;
1763 0 : clear_buffer_delay(bh);
1764 0 : if (buffer_new(bh)) {
1765 : /* blockdev mappings never come here */
1766 0 : clear_buffer_new(bh);
1767 0 : clean_bdev_bh_alias(bh);
1768 : }
1769 : }
1770 0 : bh = bh->b_this_page;
1771 0 : block++;
1772 0 : } while (bh != head);
1773 :
1774 : do {
1775 0 : if (!buffer_mapped(bh))
1776 0 : continue;
1777 : /*
1778 : * If it's a fully non-blocking write attempt and we cannot
1779 : * lock the buffer then redirty the page. Note that this can
1780 : * potentially cause a busy-wait loop from writeback threads
1781 : * and kswapd activity, but those code paths have their own
1782 : * higher-level throttling.
1783 : */
1784 0 : if (wbc->sync_mode != WB_SYNC_NONE) {
1785 : lock_buffer(bh);
1786 0 : } else if (!trylock_buffer(bh)) {
1787 0 : redirty_page_for_writepage(wbc, page);
1788 0 : continue;
1789 : }
1790 0 : if (test_clear_buffer_dirty(bh)) {
1791 : mark_buffer_async_write_endio(bh, handler);
1792 : } else {
1793 : unlock_buffer(bh);
1794 : }
1795 0 : } while ((bh = bh->b_this_page) != head);
1796 :
1797 : /*
1798 : * The page and its buffers are protected by PageWriteback(), so we can
1799 : * drop the bh refcounts early.
1800 : */
1801 0 : BUG_ON(PageWriteback(page));
1802 0 : set_page_writeback(page);
1803 :
1804 : do {
1805 0 : struct buffer_head *next = bh->b_this_page;
1806 0 : if (buffer_async_write(bh)) {
1807 0 : submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc);
1808 0 : nr_underway++;
1809 : }
1810 0 : bh = next;
1811 0 : } while (bh != head);
1812 0 : unlock_page(page);
1813 :
1814 0 : err = 0;
1815 : done:
1816 0 : if (nr_underway == 0) {
1817 : /*
1818 : * The page was marked dirty, but the buffers were
1819 : * clean. Someone wrote them back by hand with
1820 : * ll_rw_block/submit_bh. A rare case.
1821 : */
1822 0 : end_page_writeback(page);
1823 :
1824 : /*
1825 : * The page and buffer_heads can be released at any time from
1826 : * here on.
1827 : */
1828 : }
1829 0 : return err;
1830 :
1831 : recover:
1832 : /*
1833 : * ENOSPC, or some other error. We may already have added some
1834 : * blocks to the file, so we need to write these out to avoid
1835 : * exposing stale data.
1836 : * The page is currently locked and not marked for writeback
1837 : */
1838 : bh = head;
1839 : /* Recovery: lock and submit the mapped buffers */
1840 : do {
1841 0 : if (buffer_mapped(bh) && buffer_dirty(bh) &&
1842 0 : !buffer_delay(bh)) {
1843 0 : lock_buffer(bh);
1844 : mark_buffer_async_write_endio(bh, handler);
1845 : } else {
1846 : /*
1847 : * The buffer may have been set dirty during
1848 : * attachment to a dirty page.
1849 : */
1850 : clear_buffer_dirty(bh);
1851 : }
1852 0 : } while ((bh = bh->b_this_page) != head);
1853 0 : SetPageError(page);
1854 0 : BUG_ON(PageWriteback(page));
1855 0 : mapping_set_error(page->mapping, err);
1856 0 : set_page_writeback(page);
1857 : do {
1858 0 : struct buffer_head *next = bh->b_this_page;
1859 0 : if (buffer_async_write(bh)) {
1860 0 : clear_buffer_dirty(bh);
1861 0 : submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc);
1862 0 : nr_underway++;
1863 : }
1864 0 : bh = next;
1865 0 : } while (bh != head);
1866 0 : unlock_page(page);
1867 0 : goto done;
1868 : }
1869 : EXPORT_SYMBOL(__block_write_full_page);
1870 :
1871 : /*
1872 : * If a page has any new buffers, zero them out here, and mark them uptodate
1873 : * and dirty so they'll be written out (in order to prevent uninitialised
1874 : * block data from leaking). And clear the new bit.
1875 : */
1876 0 : void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1877 : {
1878 : unsigned int block_start, block_end;
1879 : struct buffer_head *head, *bh;
1880 :
1881 0 : BUG_ON(!PageLocked(page));
1882 0 : if (!page_has_buffers(page))
1883 : return;
1884 :
1885 0 : bh = head = page_buffers(page);
1886 0 : block_start = 0;
1887 : do {
1888 0 : block_end = block_start + bh->b_size;
1889 :
1890 0 : if (buffer_new(bh)) {
1891 0 : if (block_end > from && block_start < to) {
1892 0 : if (!PageUptodate(page)) {
1893 : unsigned start, size;
1894 :
1895 0 : start = max(from, block_start);
1896 0 : size = min(to, block_end) - start;
1897 :
1898 0 : zero_user(page, start, size);
1899 : set_buffer_uptodate(bh);
1900 : }
1901 :
1902 0 : clear_buffer_new(bh);
1903 0 : mark_buffer_dirty(bh);
1904 : }
1905 : }
1906 :
1907 0 : block_start = block_end;
1908 0 : bh = bh->b_this_page;
1909 0 : } while (bh != head);
1910 : }
1911 : EXPORT_SYMBOL(page_zero_new_buffers);
1912 :
1913 : static void
1914 0 : iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
1915 : const struct iomap *iomap)
1916 : {
1917 0 : loff_t offset = block << inode->i_blkbits;
1918 :
1919 0 : bh->b_bdev = iomap->bdev;
1920 :
1921 : /*
1922 : * The block points to the offset in the file we need to map; the iomap contains
1923 : * the offset at which the map starts. If the map ends before the
1924 : * current block, then do not map the buffer and let the caller
1925 : * handle it.
1926 : */
1927 0 : BUG_ON(offset >= iomap->offset + iomap->length);
1928 :
1929 0 : switch (iomap->type) {
1930 : case IOMAP_HOLE:
1931 : /*
1932 : * If the buffer is not up to date or beyond the current EOF,
1933 : * we need to mark it as new to ensure sub-block zeroing is
1934 : * executed if necessary.
1935 : */
1936 0 : if (!buffer_uptodate(bh) ||
1937 0 : (offset >= i_size_read(inode)))
1938 : set_buffer_new(bh);
1939 : break;
1940 : case IOMAP_DELALLOC:
1941 0 : if (!buffer_uptodate(bh) ||
1942 0 : (offset >= i_size_read(inode)))
1943 : set_buffer_new(bh);
1944 0 : set_buffer_uptodate(bh);
1945 0 : set_buffer_mapped(bh);
1946 : set_buffer_delay(bh);
1947 : break;
1948 : case IOMAP_UNWRITTEN:
1949 : /*
1950 : * For unwritten regions, we always need to ensure that regions
1951 : * in the block we are not writing to are zeroed. Mark the
1952 : * buffer as new to ensure this.
1953 : */
1954 0 : set_buffer_new(bh);
1955 : set_buffer_unwritten(bh);
1956 : fallthrough;
1957 : case IOMAP_MAPPED:
1958 0 : if ((iomap->flags & IOMAP_F_NEW) ||
1959 0 : offset >= i_size_read(inode))
1960 : set_buffer_new(bh);
1961 0 : bh->b_blocknr = (iomap->addr + offset - iomap->offset) >>
1962 0 : inode->i_blkbits;
1963 : set_buffer_mapped(bh);
1964 : break;
1965 : }
1966 0 : }
1967 :
1968 0 : int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
1969 : get_block_t *get_block, const struct iomap *iomap)
1970 : {
1971 0 : unsigned from = pos & (PAGE_SIZE - 1);
1972 0 : unsigned to = from + len;
1973 0 : struct inode *inode = folio->mapping->host;
1974 : unsigned block_start, block_end;
1975 : sector_t block;
1976 0 : int err = 0;
1977 : unsigned blocksize, bbits;
1978 0 : struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1979 :
1980 0 : BUG_ON(!folio_test_locked(folio));
1981 : BUG_ON(from > PAGE_SIZE);
1982 0 : BUG_ON(to > PAGE_SIZE);
1983 0 : BUG_ON(from > to);
1984 :
1985 0 : head = create_page_buffers(&folio->page, inode, 0);
1986 0 : blocksize = head->b_size;
1987 0 : bbits = block_size_bits(blocksize);
1988 :
1989 0 : block = (sector_t)folio->index << (PAGE_SHIFT - bbits);
1990 :
1991 0 : for(bh = head, block_start = 0; bh != head || !block_start;
1992 0 : block++, block_start=block_end, bh = bh->b_this_page) {
1993 0 : block_end = block_start + blocksize;
1994 0 : if (block_end <= from || block_start >= to) {
1995 0 : if (folio_test_uptodate(folio)) {
1996 0 : if (!buffer_uptodate(bh))
1997 0 : set_buffer_uptodate(bh);
1998 : }
1999 0 : continue;
2000 : }
2001 0 : if (buffer_new(bh))
2002 0 : clear_buffer_new(bh);
2003 0 : if (!buffer_mapped(bh)) {
2004 0 : WARN_ON(bh->b_size != blocksize);
2005 0 : if (get_block) {
2006 0 : err = get_block(inode, block, bh, 1);
2007 0 : if (err)
2008 : break;
2009 : } else {
2010 0 : iomap_to_bh(inode, block, bh, iomap);
2011 : }
2012 :
2013 0 : if (buffer_new(bh)) {
2014 0 : clean_bdev_bh_alias(bh);
2015 0 : if (folio_test_uptodate(folio)) {
2016 0 : clear_buffer_new(bh);
2017 0 : set_buffer_uptodate(bh);
2018 0 : mark_buffer_dirty(bh);
2019 0 : continue;
2020 : }
2021 0 : if (block_end > to || block_start < from)
2022 0 : folio_zero_segments(folio,
2023 : to, block_end,
2024 : block_start, from);
2025 0 : continue;
2026 : }
2027 : }
2028 0 : if (folio_test_uptodate(folio)) {
2029 0 : if (!buffer_uptodate(bh))
2030 0 : set_buffer_uptodate(bh);
2031 0 : continue;
2032 : }
2033 0 : if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
2034 0 : !buffer_unwritten(bh) &&
2035 0 : (block_start < from || block_end > to)) {
2036 0 : ll_rw_block(REQ_OP_READ, 0, 1, &bh);
2037 0 : *wait_bh++=bh;
2038 : }
2039 : }
2040 : /*
2041 : * If we issued read requests - let them complete.
2042 : */
2043 0 : while(wait_bh > wait) {
2044 0 : wait_on_buffer(*--wait_bh);
2045 0 : if (!buffer_uptodate(*wait_bh))
2046 0 : err = -EIO;
2047 : }
2048 0 : if (unlikely(err))
2049 0 : page_zero_new_buffers(&folio->page, from, to);
2050 0 : return err;
2051 : }
2052 :
2053 0 : int __block_write_begin(struct page *page, loff_t pos, unsigned len,
2054 : get_block_t *get_block)
2055 : {
2056 0 : return __block_write_begin_int(page_folio(page), pos, len, get_block,
2057 : NULL);
2058 : }
2059 : EXPORT_SYMBOL(__block_write_begin);
2060 :
2061 0 : static int __block_commit_write(struct inode *inode, struct page *page,
2062 : unsigned from, unsigned to)
2063 : {
2064 : unsigned block_start, block_end;
2065 0 : int partial = 0;
2066 : unsigned blocksize;
2067 : struct buffer_head *bh, *head;
2068 :
2069 0 : bh = head = page_buffers(page);
2070 0 : blocksize = bh->b_size;
2071 :
2072 0 : block_start = 0;
2073 : do {
2074 0 : block_end = block_start + blocksize;
2075 0 : if (block_end <= from || block_start >= to) {
2076 0 : if (!buffer_uptodate(bh))
2077 0 : partial = 1;
2078 : } else {
2079 0 : set_buffer_uptodate(bh);
2080 0 : mark_buffer_dirty(bh);
2081 : }
2082 0 : if (buffer_new(bh))
2083 : clear_buffer_new(bh);
2084 :
2085 0 : block_start = block_end;
2086 0 : bh = bh->b_this_page;
2087 0 : } while (bh != head);
2088 :
2089 : /*
2090 : * If this is a partial write which happened to make all buffers
2091 : * uptodate then we can optimize away a bogus readpage() for
2092 : * the next read(). Here we 'discover' whether the page went
2093 : * uptodate as a result of this (potentially partial) write.
2094 : */
2095 0 : if (!partial)
2096 : SetPageUptodate(page);
2097 0 : return 0;
2098 : }
2099 :
2100 : /*
2101 : * block_write_begin takes care of the basic task of block allocation and
2102 : * bringing partial write blocks uptodate first.
2103 : *
2104 : * The filesystem needs to handle block truncation upon failure.
2105 : */
2106 0 : int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
2107 : unsigned flags, struct page **pagep, get_block_t *get_block)
2108 : {
2109 0 : pgoff_t index = pos >> PAGE_SHIFT;
2110 : struct page *page;
2111 : int status;
2112 :
2113 0 : page = grab_cache_page_write_begin(mapping, index, flags);
2114 0 : if (!page)
2115 : return -ENOMEM;
2116 :
2117 0 : status = __block_write_begin(page, pos, len, get_block);
2118 0 : if (unlikely(status)) {
2119 0 : unlock_page(page);
2120 0 : put_page(page);
2121 0 : page = NULL;
2122 : }
2123 :
2124 0 : *pagep = page;
2125 0 : return status;
2126 : }
2127 : EXPORT_SYMBOL(block_write_begin);
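/*
 * Illustrative sketch, not from this file: a buffer_head based filesystem
 * typically wires ->write_begin straight to block_write_begin() with its own
 * get_block callback and pairs it with generic_write_end(). The names
 * myfs_write_begin and myfs_get_block are hypothetical.
 */
static int myfs_write_begin(struct file *file, struct address_space *mapping,
                            loff_t pos, unsigned len, unsigned flags,
                            struct page **pagep, void **fsdata)
{
        int ret = block_write_begin(mapping, pos, len, flags, pagep,
                                    myfs_get_block);
        /* a real filesystem would truncate blocks allocated past i_size on failure */
        return ret;
}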
2128 :
2129 0 : int block_write_end(struct file *file, struct address_space *mapping,
2130 : loff_t pos, unsigned len, unsigned copied,
2131 : struct page *page, void *fsdata)
2132 : {
2133 0 : struct inode *inode = mapping->host;
2134 : unsigned start;
2135 :
2136 0 : start = pos & (PAGE_SIZE - 1);
2137 :
2138 0 : if (unlikely(copied < len)) {
2139 : /*
2140 : * The buffers that were written will now be uptodate, so we
2141 : * don't have to worry about a readpage reading them and
2142 : * overwriting a partial write. However if we have encountered
2143 : * a short write and only partially written into a buffer, it
2144 : * will not be marked uptodate, so a readpage might come in and
2145 : * destroy our partial write.
2146 : *
2147 : * Do the simplest thing, and just treat any short write to a
2148 : * non uptodate page as a zero-length write, and force the
2149 : * caller to redo the whole thing.
2150 : */
2151 0 : if (!PageUptodate(page))
2152 0 : copied = 0;
2153 :
2154 0 : page_zero_new_buffers(page, start+copied, start+len);
2155 : }
2156 0 : flush_dcache_page(page);
2157 :
2158 : /* This could be a short (even 0-length) commit */
2159 0 : __block_commit_write(inode, page, start, start+copied);
2160 :
2161 0 : return copied;
2162 : }
2163 : EXPORT_SYMBOL(block_write_end);
2164 :
2165 0 : int generic_write_end(struct file *file, struct address_space *mapping,
2166 : loff_t pos, unsigned len, unsigned copied,
2167 : struct page *page, void *fsdata)
2168 : {
2169 0 : struct inode *inode = mapping->host;
2170 0 : loff_t old_size = inode->i_size;
2171 0 : bool i_size_changed = false;
2172 :
2173 0 : copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2174 :
2175 : /*
2176 : * No need to use i_size_read() here, the i_size cannot change under us
2177 : * because we hold i_rwsem.
2178 : *
2179 : * But it's important to update i_size while still holding page lock:
2180 : * page writeout could otherwise come in and zero beyond i_size.
2181 : */
2182 0 : if (pos + copied > inode->i_size) {
2183 0 : i_size_write(inode, pos + copied);
2184 0 : i_size_changed = true;
2185 : }
2186 :
2187 0 : unlock_page(page);
2188 0 : put_page(page);
2189 :
2190 0 : if (old_size < pos)
2191 0 : pagecache_isize_extended(inode, old_size, pos);
2192 : /*
2193 : * Don't mark the inode dirty under page lock. First, it unnecessarily
2194 : * makes the holding time of page lock longer. Second, it forces lock
2195 : * ordering of page lock and transaction start for journaling
2196 : * filesystems.
2197 : */
2198 0 : if (i_size_changed)
2199 : mark_inode_dirty(inode);
2200 0 : return copied;
2201 : }
2202 : EXPORT_SYMBOL(generic_write_end);
2203 :
2204 : /*
2205 : * block_is_partially_uptodate checks whether buffers within a folio are
2206 : * uptodate or not.
2207 : *
2208 : * Returns true if all buffers which correspond to the specified part
2209 : * of the folio are uptodate.
2210 : */
2211 0 : bool block_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
2212 : {
2213 : unsigned block_start, block_end, blocksize;
2214 : unsigned to;
2215 : struct buffer_head *bh, *head;
2216 0 : bool ret = true;
2217 :
2218 0 : head = folio_buffers(folio);
2219 0 : if (!head)
2220 : return false;
2221 0 : blocksize = head->b_size;
2222 0 : to = min_t(unsigned, folio_size(folio) - from, count);
2223 0 : to = from + to;
2224 0 : if (from < blocksize && to > folio_size(folio) - blocksize)
2225 : return false;
2226 :
2227 : bh = head;
2228 : block_start = 0;
2229 : do {
2230 0 : block_end = block_start + blocksize;
2231 0 : if (block_end > from && block_start < to) {
2232 0 : if (!buffer_uptodate(bh)) {
2233 : ret = false;
2234 : break;
2235 : }
2236 0 : if (block_end >= to)
2237 : break;
2238 : }
2239 0 : block_start = block_end;
2240 0 : bh = bh->b_this_page;
2241 0 : } while (bh != head);
2242 :
2243 : return ret;
2244 : }
2245 : EXPORT_SYMBOL(block_is_partially_uptodate);
2246 :
2247 : /*
2248 : * Generic "read page" function for block devices that have the normal
2249 : * get_block functionality; this covers most block device filesystems.
2250 : * Reads the page asynchronously --- the unlock_buffer() and
2251 : * set/clear_buffer_uptodate() functions propagate buffer state into the
2252 : * page struct once IO has completed.
2253 : */
2254 0 : int block_read_full_page(struct page *page, get_block_t *get_block)
2255 : {
2256 0 : struct inode *inode = page->mapping->host;
2257 : sector_t iblock, lblock;
2258 : struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2259 : unsigned int blocksize, bbits;
2260 : int nr, i;
2261 0 : int fully_mapped = 1;
2262 :
2263 0 : head = create_page_buffers(page, inode, 0);
2264 0 : blocksize = head->b_size;
2265 0 : bbits = block_size_bits(blocksize);
2266 :
2267 0 : iblock = (sector_t)page->index << (PAGE_SHIFT - bbits);
2268 0 : lblock = (i_size_read(inode)+blocksize-1) >> bbits;
2269 0 : bh = head;
2270 0 : nr = 0;
2271 0 : i = 0;
2272 :
2273 : do {
2274 0 : if (buffer_uptodate(bh))
2275 0 : continue;
2276 :
2277 0 : if (!buffer_mapped(bh)) {
2278 0 : int err = 0;
2279 :
2280 0 : fully_mapped = 0;
2281 0 : if (iblock < lblock) {
2282 0 : WARN_ON(bh->b_size != blocksize);
2283 0 : err = get_block(inode, iblock, bh, 0);
2284 0 : if (err)
2285 : SetPageError(page);
2286 : }
2287 0 : if (!buffer_mapped(bh)) {
2288 0 : zero_user(page, i * blocksize, blocksize);
2289 0 : if (!err)
2290 : set_buffer_uptodate(bh);
2291 0 : continue;
2292 : }
2293 : /*
2294 : * get_block() might have updated the buffer
2295 : * synchronously
2296 : */
2297 0 : if (buffer_uptodate(bh))
2298 0 : continue;
2299 : }
2300 0 : arr[nr++] = bh;
2301 0 : } while (i++, iblock++, (bh = bh->b_this_page) != head);
2302 :
2303 0 : if (fully_mapped)
2304 : SetPageMappedToDisk(page);
2305 :
2306 0 : if (!nr) {
2307 : /*
2308 : * All buffers are uptodate - we can set the page uptodate
2309 : * as well. But not if get_block() returned an error.
2310 : */
2311 0 : if (!PageError(page))
2312 : SetPageUptodate(page);
2313 0 : unlock_page(page);
2314 0 : return 0;
2315 : }
2316 :
2317 : /* Stage two: lock the buffers */
2318 0 : for (i = 0; i < nr; i++) {
2319 0 : bh = arr[i];
2320 0 : lock_buffer(bh);
2321 0 : mark_buffer_async_read(bh);
2322 : }
2323 :
2324 : /*
2325 : * Stage 3: start the IO. Check for uptodateness
2326 : * inside the buffer lock in case another process reading
2327 : * the underlying blockdev brought it uptodate (the sct fix).
2328 : */
2329 0 : for (i = 0; i < nr; i++) {
2330 0 : bh = arr[i];
2331 0 : if (buffer_uptodate(bh))
2332 0 : end_buffer_async_read(bh, 1);
2333 : else
2334 : submit_bh(REQ_OP_READ, 0, bh);
2335 : }
2336 : return 0;
2337 : }
2338 : EXPORT_SYMBOL(block_read_full_page);
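/*
 * Illustrative sketch, not from this file: block_read_full_page() is usually
 * the whole ->readpage implementation of such a filesystem. myfs_readpage and
 * myfs_get_block are hypothetical names.
 */
static int myfs_readpage(struct file *file, struct page *page)
{
        return block_read_full_page(page, myfs_get_block);
}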
2339 :
2340 : /* utility function for filesystems that need to do work on expanding
2341 : * truncates. Uses filesystem pagecache writes to allow the filesystem to
2342 : * deal with the hole.
2343 : */
2344 0 : int generic_cont_expand_simple(struct inode *inode, loff_t size)
2345 : {
2346 0 : struct address_space *mapping = inode->i_mapping;
2347 : struct page *page;
2348 : void *fsdata;
2349 : int err;
2350 :
2351 0 : err = inode_newsize_ok(inode, size);
2352 0 : if (err)
2353 : goto out;
2354 :
2355 0 : err = pagecache_write_begin(NULL, mapping, size, 0, 0, &page, &fsdata);
2356 0 : if (err)
2357 : goto out;
2358 :
2359 0 : err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
2360 0 : BUG_ON(err > 0);
2361 :
2362 : out:
2363 0 : return err;
2364 : }
2365 : EXPORT_SYMBOL(generic_cont_expand_simple);
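/*
 * Illustrative sketch, not from this file: a filesystem that cannot represent
 * holes may call generic_cont_expand_simple() when a truncate grows the file,
 * so the new tail is allocated and reads back as zeroes. myfs_grow_file is a
 * hypothetical helper.
 */
static int myfs_grow_file(struct inode *inode, loff_t newsize)
{
        if (newsize <= i_size_read(inode))
                return 0;
        return generic_cont_expand_simple(inode, newsize);
}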
2366 :
2367 0 : static int cont_expand_zero(struct file *file, struct address_space *mapping,
2368 : loff_t pos, loff_t *bytes)
2369 : {
2370 0 : struct inode *inode = mapping->host;
2371 0 : unsigned int blocksize = i_blocksize(inode);
2372 : struct page *page;
2373 : void *fsdata;
2374 : pgoff_t index, curidx;
2375 : loff_t curpos;
2376 : unsigned zerofrom, offset, len;
2377 0 : int err = 0;
2378 :
2379 0 : index = pos >> PAGE_SHIFT;
2380 0 : offset = pos & ~PAGE_MASK;
2381 :
2382 0 : while (index > (curidx = (curpos = *bytes)>>PAGE_SHIFT)) {
2383 0 : zerofrom = curpos & ~PAGE_MASK;
2384 0 : if (zerofrom & (blocksize-1)) {
2385 0 : *bytes |= (blocksize-1);
2386 0 : (*bytes)++;
2387 : }
2388 0 : len = PAGE_SIZE - zerofrom;
2389 :
2390 0 : err = pagecache_write_begin(file, mapping, curpos, len, 0,
2391 : &page, &fsdata);
2392 0 : if (err)
2393 : goto out;
2394 0 : zero_user(page, zerofrom, len);
2395 0 : err = pagecache_write_end(file, mapping, curpos, len, len,
2396 : page, fsdata);
2397 0 : if (err < 0)
2398 : goto out;
2399 0 : BUG_ON(err != len);
2400 0 : err = 0;
2401 :
2402 0 : balance_dirty_pages_ratelimited(mapping);
2403 :
2404 0 : if (fatal_signal_pending(current)) {
2405 : err = -EINTR;
2406 : goto out;
2407 : }
2408 : }
2409 :
2410 : /* page covers the boundary, find the boundary offset */
2411 0 : if (index == curidx) {
2412 0 : zerofrom = curpos & ~PAGE_MASK;
2413 : /* if we will expand the thing last block will be filled */
2414 0 : if (offset <= zerofrom) {
2415 : goto out;
2416 : }
2417 0 : if (zerofrom & (blocksize-1)) {
2418 0 : *bytes |= (blocksize-1);
2419 0 : (*bytes)++;
2420 : }
2421 0 : len = offset - zerofrom;
2422 :
2423 0 : err = pagecache_write_begin(file, mapping, curpos, len, 0,
2424 : &page, &fsdata);
2425 0 : if (err)
2426 : goto out;
2427 0 : zero_user(page, zerofrom, len);
2428 0 : err = pagecache_write_end(file, mapping, curpos, len, len,
2429 : page, fsdata);
2430 0 : if (err < 0)
2431 : goto out;
2432 0 : BUG_ON(err != len);
2433 : err = 0;
2434 : }
2435 : out:
2436 0 : return err;
2437 : }
2438 :
2439 : /*
2440 : * For moronic filesystems that do not allow holes in files.
2441 : * We may have to extend the file.
2442 : */
2443 0 : int cont_write_begin(struct file *file, struct address_space *mapping,
2444 : loff_t pos, unsigned len, unsigned flags,
2445 : struct page **pagep, void **fsdata,
2446 : get_block_t *get_block, loff_t *bytes)
2447 : {
2448 0 : struct inode *inode = mapping->host;
2449 0 : unsigned int blocksize = i_blocksize(inode);
2450 : unsigned int zerofrom;
2451 : int err;
2452 :
2453 0 : err = cont_expand_zero(file, mapping, pos, bytes);
2454 0 : if (err)
2455 : return err;
2456 :
2457 0 : zerofrom = *bytes & ~PAGE_MASK;
2458 0 : if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2459 0 : *bytes |= (blocksize-1);
2460 0 : (*bytes)++;
2461 : }
2462 :
2463 0 : return block_write_begin(mapping, pos, len, flags, pagep, get_block);
2464 : }
2465 : EXPORT_SYMBOL(cont_write_begin);
2466 :
2467 0 : int block_commit_write(struct page *page, unsigned from, unsigned to)
2468 : {
2469 0 : struct inode *inode = page->mapping->host;
2470 0 : __block_commit_write(inode,page,from,to);
2471 0 : return 0;
2472 : }
2473 : EXPORT_SYMBOL(block_commit_write);
2474 :
2475 : /*
2476 : * block_page_mkwrite() is not allowed to change the file size as it gets
2477 : * called from a page fault handler when a page is first dirtied. Hence we must
2478 : * be careful to check for EOF conditions here. We set the page up correctly
2479 : * for a written page which means we get ENOSPC checking when writing into
2480 : * holes and correct delalloc and unwritten extent mapping on filesystems that
2481 : * support these features.
2482 : *
2483 : * We are not allowed to take the i_mutex here so we have to play games to
2484 : * protect against truncate races as the page could now be beyond EOF. Because
2485 : * truncate writes the inode size before removing pages, once we have the
2486 : * page lock we can determine safely if the page is beyond EOF. If it is not
2487 : * beyond EOF, then the page is guaranteed safe against truncation until we
2488 : * unlock the page.
2489 : *
2490 : * Direct callers of this function should protect against filesystem freezing
2491 : * using sb_start_pagefault() - sb_end_pagefault() functions.
2492 : */
2493 0 : int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2494 : get_block_t get_block)
2495 : {
2496 0 : struct page *page = vmf->page;
2497 0 : struct inode *inode = file_inode(vma->vm_file);
2498 : unsigned long end;
2499 : loff_t size;
2500 : int ret;
2501 :
2502 0 : lock_page(page);
2503 0 : size = i_size_read(inode);
2504 0 : if ((page->mapping != inode->i_mapping) ||
2505 0 : (page_offset(page) > size)) {
2506 : /* We overload EFAULT to mean page got truncated */
2507 : ret = -EFAULT;
2508 : goto out_unlock;
2509 : }
2510 :
2511 : /* page is wholly or partially inside EOF */
2512 0 : if (((page->index + 1) << PAGE_SHIFT) > size)
2513 0 : end = size & ~PAGE_MASK;
2514 : else
2515 : end = PAGE_SIZE;
2516 :
2517 0 : ret = __block_write_begin(page, 0, end, get_block);
2518 0 : if (!ret)
2519 0 : ret = block_commit_write(page, 0, end);
2520 :
2521 0 : if (unlikely(ret < 0))
2522 : goto out_unlock;
2523 0 : set_page_dirty(page);
2524 0 : wait_for_stable_page(page);
2525 0 : return 0;
2526 : out_unlock:
2527 0 : unlock_page(page);
2528 0 : return ret;
2529 : }
2530 : EXPORT_SYMBOL(block_page_mkwrite);
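/*
 * Illustrative sketch, not from this file: a direct caller protects against
 * filesystem freezing, as the comment above requires, and converts the errno
 * into a VM fault code. myfs_page_mkwrite and myfs_get_block are hypothetical.
 */
static vm_fault_t myfs_page_mkwrite(struct vm_fault *vmf)
{
        struct super_block *sb = file_inode(vmf->vma->vm_file)->i_sb;
        int err;

        sb_start_pagefault(sb);
        err = block_page_mkwrite(vmf->vma, vmf, myfs_get_block);
        sb_end_pagefault(sb);
        return block_page_mkwrite_return(err);
}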
2531 :
2532 : /*
2533 : * nobh_write_begin()'s prereads are special: the buffer_heads are freed
2534 : * immediately, while under the page lock. So it needs a special end_io
2535 : * handler which does not touch the bh after unlocking it.
2536 : */
2537 0 : static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2538 : {
2539 0 : __end_buffer_read_notouch(bh, uptodate);
2540 0 : }
2541 :
2542 : /*
2543 : * Attach the singly-linked list of buffers created by nobh_write_begin to
2544 : * the page (converting it to a circular linked list and taking care of page
2545 : * dirty races).
2546 : */
2547 0 : static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2548 : {
2549 : struct buffer_head *bh;
2550 :
2551 0 : BUG_ON(!PageLocked(page));
2552 :
2553 0 : spin_lock(&page->mapping->private_lock);
2554 0 : bh = head;
2555 : do {
2556 0 : if (PageDirty(page))
2557 : set_buffer_dirty(bh);
2558 0 : if (!bh->b_this_page)
2559 0 : bh->b_this_page = head;
2560 0 : bh = bh->b_this_page;
2561 0 : } while (bh != head);
2562 0 : attach_page_private(page, head);
2563 0 : spin_unlock(&page->mapping->private_lock);
2564 0 : }
2565 :
2566 : /*
2567 : * On entry, the page is not uptodate at all.
2568 : * On exit, the page is fully uptodate in the areas outside (from, to).
2569 : * The filesystem needs to handle block truncation upon failure.
2570 : */
2571 0 : int nobh_write_begin(struct address_space *mapping,
2572 : loff_t pos, unsigned len, unsigned flags,
2573 : struct page **pagep, void **fsdata,
2574 : get_block_t *get_block)
2575 : {
2576 0 : struct inode *inode = mapping->host;
2577 0 : const unsigned blkbits = inode->i_blkbits;
2578 0 : const unsigned blocksize = 1 << blkbits;
2579 : struct buffer_head *head, *bh;
2580 : struct page *page;
2581 : pgoff_t index;
2582 : unsigned from, to;
2583 : unsigned block_in_page;
2584 : unsigned block_start, block_end;
2585 : sector_t block_in_file;
2586 0 : int nr_reads = 0;
2587 0 : int ret = 0;
2588 0 : int is_mapped_to_disk = 1;
2589 :
2590 0 : index = pos >> PAGE_SHIFT;
2591 0 : from = pos & (PAGE_SIZE - 1);
2592 0 : to = from + len;
2593 :
2594 0 : page = grab_cache_page_write_begin(mapping, index, flags);
2595 0 : if (!page)
2596 : return -ENOMEM;
2597 0 : *pagep = page;
2598 0 : *fsdata = NULL;
2599 :
2600 0 : if (page_has_buffers(page)) {
2601 0 : ret = __block_write_begin(page, pos, len, get_block);
2602 0 : if (unlikely(ret))
2603 : goto out_release;
2604 : return ret;
2605 : }
2606 :
2607 0 : if (PageMappedToDisk(page))
2608 : return 0;
2609 :
2610 : /*
2611 : * Allocate buffers so that we can keep track of state, and potentially
2612 : * attach them to the page if an error occurs. In the common case of
2613 : * no error, they will just be freed again without ever being attached
2614 : * to the page (which is all OK, because we're under the page lock).
2615 : *
2616 : * Be careful: the buffer linked list is a NULL terminated one, rather
2617 : * than the circular one we're used to.
2618 : */
2619 0 : head = alloc_page_buffers(page, blocksize, false);
2620 0 : if (!head) {
2621 : ret = -ENOMEM;
2622 : goto out_release;
2623 : }
2624 :
2625 0 : block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
2626 :
2627 : /*
2628 : * We loop across all blocks in the page, whether or not they are
2629 : * part of the affected region. This is so we can discover if the
2630 : * page is fully mapped-to-disk.
2631 : */
2632 0 : for (block_start = 0, block_in_page = 0, bh = head;
2633 : block_start < PAGE_SIZE;
2634 0 : block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
2635 : int create;
2636 :
2637 0 : block_end = block_start + blocksize;
2638 0 : bh->b_state = 0;
2639 0 : create = 1;
2640 0 : if (block_start >= to)
2641 0 : create = 0;
2642 0 : ret = get_block(inode, block_in_file + block_in_page,
2643 : bh, create);
2644 0 : if (ret)
2645 : goto failed;
2646 0 : if (!buffer_mapped(bh))
2647 0 : is_mapped_to_disk = 0;
2648 0 : if (buffer_new(bh))
2649 0 : clean_bdev_bh_alias(bh);
2650 0 : if (PageUptodate(page)) {
2651 0 : set_buffer_uptodate(bh);
2652 0 : continue;
2653 : }
2654 0 : if (buffer_new(bh) || !buffer_mapped(bh)) {
2655 0 : zero_user_segments(page, block_start, from,
2656 : to, block_end);
2657 0 : continue;
2658 : }
2659 0 : if (buffer_uptodate(bh))
2660 0 : continue; /* reiserfs does this */
2661 0 : if (block_start < from || block_end > to) {
2662 0 : lock_buffer(bh);
2663 0 : bh->b_end_io = end_buffer_read_nobh;
2664 0 : submit_bh(REQ_OP_READ, 0, bh);
2665 0 : nr_reads++;
2666 : }
2667 : }
2668 :
2669 0 : if (nr_reads) {
2670 : /*
2671 : * The page is locked, so these buffers are protected from
2672 : * any VM or truncate activity. Hence we don't need to care
2673 : * for the buffer_head refcounts.
2674 : */
2675 0 : for (bh = head; bh; bh = bh->b_this_page) {
2676 0 : wait_on_buffer(bh);
2677 0 : if (!buffer_uptodate(bh))
2678 0 : ret = -EIO;
2679 : }
2680 0 : if (ret)
2681 : goto failed;
2682 : }
2683 :
2684 0 : if (is_mapped_to_disk)
2685 : SetPageMappedToDisk(page);
2686 :
2687 0 : *fsdata = head; /* to be released by nobh_write_end */
2688 :
2689 0 : return 0;
2690 :
2691 : failed:
2692 0 : BUG_ON(!ret);
2693 : /*
2694 : * Error recovery is a bit difficult. We need to zero out blocks that
2695 : * were newly allocated, and dirty them to ensure they get written out.
2696 : * Buffers need to be attached to the page at this point, otherwise
2697 : * the handling of potential IO errors during writeout would be hard
2698 : * (could try doing synchronous writeout, but what if that fails too?)
2699 : */
2700 0 : attach_nobh_buffers(page, head);
2701 0 : page_zero_new_buffers(page, from, to);
2702 :
2703 : out_release:
2704 0 : unlock_page(page);
2705 0 : put_page(page);
2706 0 : *pagep = NULL;
2707 :
2708 0 : return ret;
2709 : }
2710 : EXPORT_SYMBOL(nobh_write_begin);
2711 :
2712 0 : int nobh_write_end(struct file *file, struct address_space *mapping,
2713 : loff_t pos, unsigned len, unsigned copied,
2714 : struct page *page, void *fsdata)
2715 : {
2716 0 : struct inode *inode = page->mapping->host;
2717 0 : struct buffer_head *head = fsdata;
2718 : struct buffer_head *bh;
2719 0 : BUG_ON(fsdata != NULL && page_has_buffers(page));
2720 :
2721 0 : if (unlikely(copied < len) && head)
2722 0 : attach_nobh_buffers(page, head);
2723 0 : if (page_has_buffers(page))
2724 0 : return generic_write_end(file, mapping, pos, len,
2725 : copied, page, fsdata);
2726 :
2727 0 : SetPageUptodate(page);
2728 0 : set_page_dirty(page);
2729 0 : if (pos+copied > inode->i_size) {
2730 0 : i_size_write(inode, pos+copied);
2731 : mark_inode_dirty(inode);
2732 : }
2733 :
2734 0 : unlock_page(page);
2735 0 : put_page(page);
2736 :
2737 0 : while (head) {
2738 0 : bh = head;
2739 0 : head = head->b_this_page;
2740 0 : free_buffer_head(bh);
2741 : }
2742 :
2743 0 : return copied;
2744 : }
2745 : EXPORT_SYMBOL(nobh_write_end);
2746 :
2747 : /*
2748 : * nobh_writepage() - based on block_write_full_page() except
2749 : * that it tries to operate without attaching bufferheads to
2750 : * the page.
2751 : */
2752 0 : int nobh_writepage(struct page *page, get_block_t *get_block,
2753 : struct writeback_control *wbc)
2754 : {
2755 0 : struct inode * const inode = page->mapping->host;
2756 0 : loff_t i_size = i_size_read(inode);
2757 0 : const pgoff_t end_index = i_size >> PAGE_SHIFT;
2758 : unsigned offset;
2759 : int ret;
2760 :
2761 : /* Is the page fully inside i_size? */
2762 0 : if (page->index < end_index)
2763 : goto out;
2764 :
2765 : /* Is the page fully outside i_size? (truncate in progress) */
2766 0 : offset = i_size & (PAGE_SIZE-1);
2767 0 : if (page->index >= end_index+1 || !offset) {
2768 0 : unlock_page(page);
2769 0 : return 0; /* don't care */
2770 : }
2771 :
2772 : /*
2773 : * The page straddles i_size. It must be zeroed out on each and every
2774 : * writepage invocation because it may be mmapped. "A file is mapped
2775 : * in multiples of the page size. For a file that is not a multiple of
2776 : * the page size, the remaining memory is zeroed when mapped, and
2777 : * writes to that region are not written out to the file."
2778 : */
2779 : zero_user_segment(page, offset, PAGE_SIZE);
2780 : out:
2781 0 : ret = mpage_writepage(page, get_block, wbc);
2782 0 : if (ret == -EAGAIN)
2783 0 : ret = __block_write_full_page(inode, page, get_block, wbc,
2784 : end_buffer_async_write);
2785 : return ret;
2786 : }
2787 : EXPORT_SYMBOL(nobh_writepage);
2788 :
2789 0 : int nobh_truncate_page(struct address_space *mapping,
2790 : loff_t from, get_block_t *get_block)
2791 : {
2792 0 : pgoff_t index = from >> PAGE_SHIFT;
2793 0 : unsigned offset = from & (PAGE_SIZE-1);
2794 : unsigned blocksize;
2795 : sector_t iblock;
2796 : unsigned length, pos;
2797 0 : struct inode *inode = mapping->host;
2798 : struct page *page;
2799 : struct buffer_head map_bh;
2800 : int err;
2801 :
2802 0 : blocksize = i_blocksize(inode);
2803 0 : length = offset & (blocksize - 1);
2804 :
2805 : /* Block boundary? Nothing to do */
2806 0 : if (!length)
2807 : return 0;
2808 :
2809 0 : length = blocksize - length;
2810 0 : iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits);
2811 :
2812 0 : page = grab_cache_page(mapping, index);
2813 0 : err = -ENOMEM;
2814 0 : if (!page)
2815 : goto out;
2816 :
2817 0 : if (page_has_buffers(page)) {
2818 : has_buffers:
2819 0 : unlock_page(page);
2820 0 : put_page(page);
2821 0 : return block_truncate_page(mapping, from, get_block);
2822 : }
2823 :
2824 : /* Find the buffer that contains "offset" */
2825 : pos = blocksize;
2826 0 : while (offset >= pos) {
2827 0 : iblock++;
2828 0 : pos += blocksize;
2829 : }
2830 :
2831 0 : map_bh.b_size = blocksize;
2832 0 : map_bh.b_state = 0;
2833 0 : err = get_block(inode, iblock, &map_bh, 0);
2834 0 : if (err)
2835 : goto unlock;
2836 : /* unmapped? It's a hole - nothing to do */
2837 0 : if (!buffer_mapped(&map_bh))
2838 : goto unlock;
2839 :
2840 : /* Ok, it's mapped. Make sure it's up-to-date */
2841 0 : if (!PageUptodate(page)) {
2842 0 : err = mapping->a_ops->readpage(NULL, page);
2843 0 : if (err) {
2844 0 : put_page(page);
2845 0 : goto out;
2846 : }
2847 0 : lock_page(page);
2848 0 : if (!PageUptodate(page)) {
2849 : err = -EIO;
2850 : goto unlock;
2851 : }
2852 0 : if (page_has_buffers(page))
2853 : goto has_buffers;
2854 : }
2855 0 : zero_user(page, offset, length);
2856 0 : set_page_dirty(page);
2857 0 : err = 0;
2858 :
2859 : unlock:
2860 0 : unlock_page(page);
2861 0 : put_page(page);
2862 : out:
2863 : return err;
2864 : }
2865 : EXPORT_SYMBOL(nobh_truncate_page);
2866 :
2867 0 : int block_truncate_page(struct address_space *mapping,
2868 : loff_t from, get_block_t *get_block)
2869 : {
2870 0 : pgoff_t index = from >> PAGE_SHIFT;
2871 0 : unsigned offset = from & (PAGE_SIZE-1);
2872 : unsigned blocksize;
2873 : sector_t iblock;
2874 : unsigned length, pos;
2875 0 : struct inode *inode = mapping->host;
2876 : struct page *page;
2877 : struct buffer_head *bh;
2878 : int err;
2879 :
2880 0 : blocksize = i_blocksize(inode);
2881 0 : length = offset & (blocksize - 1);
2882 :
2883 : /* Block boundary? Nothing to do */
2884 0 : if (!length)
2885 : return 0;
2886 :
2887 0 : length = blocksize - length;
2888 0 : iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits);
2889 :
2890 0 : page = grab_cache_page(mapping, index);
2891 0 : err = -ENOMEM;
2892 0 : if (!page)
2893 : goto out;
2894 :
2895 0 : if (!page_has_buffers(page))
2896 0 : create_empty_buffers(page, blocksize, 0);
2897 :
2898 : /* Find the buffer that contains "offset" */
2899 0 : bh = page_buffers(page);
2900 0 : pos = blocksize;
2901 0 : while (offset >= pos) {
2902 0 : bh = bh->b_this_page;
2903 0 : iblock++;
2904 0 : pos += blocksize;
2905 : }
2906 :
2907 0 : err = 0;
2908 0 : if (!buffer_mapped(bh)) {
2909 0 : WARN_ON(bh->b_size != blocksize);
2910 0 : err = get_block(inode, iblock, bh, 0);
2911 0 : if (err)
2912 : goto unlock;
2913 : /* unmapped? It's a hole - nothing to do */
2914 0 : if (!buffer_mapped(bh))
2915 : goto unlock;
2916 : }
2917 :
2918 : /* Ok, it's mapped. Make sure it's up-to-date */
2919 0 : if (PageUptodate(page))
2920 0 : set_buffer_uptodate(bh);
2921 :
2922 0 : if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2923 0 : err = -EIO;
2924 0 : ll_rw_block(REQ_OP_READ, 0, 1, &bh);
2925 0 : wait_on_buffer(bh);
2926 : /* Uhhuh. Read error. Complain and punt. */
2927 0 : if (!buffer_uptodate(bh))
2928 : goto unlock;
2929 : }
2930 :
2931 0 : zero_user(page, offset, length);
2932 0 : mark_buffer_dirty(bh);
2933 0 : err = 0;
2934 :
2935 : unlock:
2936 0 : unlock_page(page);
2937 0 : put_page(page);
2938 : out:
2939 : return err;
2940 : }
2941 : EXPORT_SYMBOL(block_truncate_page);
2942 :
2943 : /*
2944 : * The generic ->writepage function for buffer-backed address_spaces
2945 : */
2946 0 : int block_write_full_page(struct page *page, get_block_t *get_block,
2947 : struct writeback_control *wbc)
2948 : {
2949 0 : struct inode * const inode = page->mapping->host;
2950 0 : loff_t i_size = i_size_read(inode);
2951 0 : const pgoff_t end_index = i_size >> PAGE_SHIFT;
2952 : unsigned offset;
2953 :
2954 : /* Is the page fully inside i_size? */
2955 0 : if (page->index < end_index)
2956 0 : return __block_write_full_page(inode, page, get_block, wbc,
2957 : end_buffer_async_write);
2958 :
2959 : /* Is the page fully outside i_size? (truncate in progress) */
2960 0 : offset = i_size & (PAGE_SIZE-1);
2961 0 : if (page->index >= end_index+1 || !offset) {
2962 0 : unlock_page(page);
2963 0 : return 0; /* don't care */
2964 : }
2965 :
2966 : /*
2967 : * The page straddles i_size. It must be zeroed out on each and every
2968 : * writepage invocation because it may be mmapped. "A file is mapped
2969 : * in multiples of the page size. For a file that is not a multiple of
2970 : * the page size, the remaining memory is zeroed when mapped, and
2971 : * writes to that region are not written out to the file."
2972 : */
2973 0 : zero_user_segment(page, offset, PAGE_SIZE);
2974 0 : return __block_write_full_page(inode, page, get_block, wbc,
2975 : end_buffer_async_write);
2976 : }
2977 : EXPORT_SYMBOL(block_write_full_page);
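/*
 * Illustrative sketch, not from this file: for most buffer_head based
 * filesystems the ->writepage method is just this thin wrapper.
 * myfs_writepage and myfs_get_block are hypothetical names.
 */
static int myfs_writepage(struct page *page, struct writeback_control *wbc)
{
        return block_write_full_page(page, myfs_get_block, wbc);
}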
2978 :
2979 0 : sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2980 : get_block_t *get_block)
2981 : {
2982 0 : struct inode *inode = mapping->host;
2983 0 : struct buffer_head tmp = {
2984 0 : .b_size = i_blocksize(inode),
2985 : };
2986 :
2987 0 : get_block(inode, block, &tmp, 0);
2988 0 : return tmp.b_blocknr;
2989 : }
2990 : EXPORT_SYMBOL(generic_block_bmap);
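/*
 * Illustrative sketch, not from this file: ->bmap is usually implemented by
 * handing the filesystem's get_block to generic_block_bmap(). myfs_bmap and
 * myfs_get_block are hypothetical names.
 */
static sector_t myfs_bmap(struct address_space *mapping, sector_t block)
{
        return generic_block_bmap(mapping, block, myfs_get_block);
}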
2991 :
2992 0 : static void end_bio_bh_io_sync(struct bio *bio)
2993 : {
2994 0 : struct buffer_head *bh = bio->bi_private;
2995 :
2996 0 : if (unlikely(bio_flagged(bio, BIO_QUIET)))
2997 0 : set_bit(BH_Quiet, &bh->b_state);
2998 :
2999 0 : bh->b_end_io(bh, !bio->bi_status);
3000 0 : bio_put(bio);
3001 0 : }
3002 :
3003 0 : static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
3004 : struct writeback_control *wbc)
3005 : {
3006 : struct bio *bio;
3007 :
3008 0 : BUG_ON(!buffer_locked(bh));
3009 0 : BUG_ON(!buffer_mapped(bh));
3010 0 : BUG_ON(!bh->b_end_io);
3011 0 : BUG_ON(buffer_delay(bh));
3012 0 : BUG_ON(buffer_unwritten(bh));
3013 :
3014 : /*
3015 : * Only clear out a write error when rewriting
3016 : */
3017 0 : if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE))
3018 : clear_buffer_write_io_error(bh);
3019 :
3020 0 : if (buffer_meta(bh))
3021 0 : op_flags |= REQ_META;
3022 0 : if (buffer_prio(bh))
3023 0 : op_flags |= REQ_PRIO;
3024 :
3025 0 : bio = bio_alloc(bh->b_bdev, 1, op | op_flags, GFP_NOIO);
3026 :
3027 0 : fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
3028 :
3029 0 : bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
3030 :
3031 0 : bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
3032 0 : BUG_ON(bio->bi_iter.bi_size != bh->b_size);
3033 :
3034 0 : bio->bi_end_io = end_bio_bh_io_sync;
3035 0 : bio->bi_private = bh;
3036 :
3037 : /* Take care of bh's that straddle the end of the device */
3038 0 : guard_bio_eod(bio);
3039 :
3040 : if (wbc) {
3041 : wbc_init_bio(wbc, bio);
3042 : wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size);
3043 : }
3044 :
3045 0 : submit_bio(bio);
3046 0 : return 0;
3047 : }
3048 :
3049 0 : int submit_bh(int op, int op_flags, struct buffer_head *bh)
3050 : {
3051 0 : return submit_bh_wbc(op, op_flags, bh, NULL);
3052 : }
3053 : EXPORT_SYMBOL(submit_bh);
3054 :
3055 : /**
3056 : * ll_rw_block: low-level access to block devices (DEPRECATED)
3057 : * @op: whether to %READ or %WRITE
3058 : * @op_flags: req_flag_bits
3059 : * @nr: number of &struct buffer_heads in the array
3060 : * @bhs: array of pointers to &struct buffer_head
3061 : *
3062 : * ll_rw_block() takes an array of pointers to &struct buffer_head, and
3063 : * requests an I/O operation on them, either a %REQ_OP_READ or a %REQ_OP_WRITE.
3064 : * @op_flags contains flags modifying the detailed I/O behavior, most notably
3065 : * %REQ_RAHEAD.
3066 : *
3067 : * This function drops any buffer that it cannot get a lock on (with the
3068 : * BH_Lock state bit), any buffer that appears to be clean when doing a write
3069 : * request, and any buffer that appears to be up-to-date when doing read
3070 : * request. Further it marks as clean buffers that are processed for
3071 : * writing (the buffer cache won't assume that they are actually clean
3072 : * until the buffer gets unlocked).
3073 : *
3074 : * ll_rw_block() sets b_end_io to a simple completion handler that marks
3075 : * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
3076 : * any waiters.
3077 : *
3078 : * All of the buffers must be for the same device, and their size must be a
3079 : * multiple of the currently approved block size for the device.
3080 : */
3081 0 : void ll_rw_block(int op, int op_flags, int nr, struct buffer_head *bhs[])
3082 : {
3083 : int i;
3084 :
3085 0 : for (i = 0; i < nr; i++) {
3086 0 : struct buffer_head *bh = bhs[i];
3087 :
3088 0 : if (!trylock_buffer(bh))
3089 0 : continue;
3090 0 : if (op == WRITE) {
3091 0 : if (test_clear_buffer_dirty(bh)) {
3092 0 : bh->b_end_io = end_buffer_write_sync;
3093 0 : get_bh(bh);
3094 0 : submit_bh(op, op_flags, bh);
3095 0 : continue;
3096 : }
3097 : } else {
3098 0 : if (!buffer_uptodate(bh)) {
3099 0 : bh->b_end_io = end_buffer_read_sync;
3100 0 : get_bh(bh);
3101 0 : submit_bh(op, op_flags, bh);
3102 0 : continue;
3103 : }
3104 : }
3105 : unlock_buffer(bh);
3106 : }
3107 0 : }
3108 : EXPORT_SYMBOL(ll_rw_block);
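/*
 * Illustrative sketch, not from this file: the usual pattern around the
 * deprecated ll_rw_block() mirrors what __block_write_begin_int() and
 * block_truncate_page() do above: start the read, wait, then test uptodate,
 * because ll_rw_block() itself never returns an error. myfs_read_bh is a
 * hypothetical helper; bh must already be mapped.
 */
static int myfs_read_bh(struct buffer_head *bh)
{
        if (buffer_uptodate(bh))
                return 0;
        ll_rw_block(REQ_OP_READ, 0, 1, &bh);
        wait_on_buffer(bh);
        if (!buffer_uptodate(bh))
                return -EIO;    /* the read failed */
        return 0;
}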
3109 :
3110 0 : void write_dirty_buffer(struct buffer_head *bh, int op_flags)
3111 : {
3112 0 : lock_buffer(bh);
3113 0 : if (!test_clear_buffer_dirty(bh)) {
3114 : unlock_buffer(bh);
3115 : return;
3116 : }
3117 0 : bh->b_end_io = end_buffer_write_sync;
3118 0 : get_bh(bh);
3119 : submit_bh(REQ_OP_WRITE, op_flags, bh);
3120 : }
3121 : EXPORT_SYMBOL(write_dirty_buffer);
3122 :
3123 : /*
3124 : * For a data-integrity writeout, we need to wait upon any in-progress I/O
3125 : * and then start new I/O and wait upon that too. The caller must have a ref on
3126 : * the buffer_head.
3127 : */
3128 0 : int __sync_dirty_buffer(struct buffer_head *bh, int op_flags)
3129 : {
3130 0 : int ret = 0;
3131 :
3132 0 : WARN_ON(atomic_read(&bh->b_count) < 1);
3133 0 : lock_buffer(bh);
3134 0 : if (test_clear_buffer_dirty(bh)) {
3135 : /*
3136 : * The bh should be mapped, but it might not be if the
3137 : * device was hot-removed. Not much we can do but fail the I/O.
3138 : */
3139 0 : if (!buffer_mapped(bh)) {
3140 0 : unlock_buffer(bh);
3141 0 : return -EIO;
3142 : }
3143 :
3144 0 : get_bh(bh);
3145 0 : bh->b_end_io = end_buffer_write_sync;
3146 0 : ret = submit_bh(REQ_OP_WRITE, op_flags, bh);
3147 0 : wait_on_buffer(bh);
3148 0 : if (!ret && !buffer_uptodate(bh))
3149 0 : ret = -EIO;
3150 : } else {
3151 : unlock_buffer(bh);
3152 : }
3153 : return ret;
3154 : }
3155 : EXPORT_SYMBOL(__sync_dirty_buffer);
3156 :
3157 0 : int sync_dirty_buffer(struct buffer_head *bh)
3158 : {
3159 0 : return __sync_dirty_buffer(bh, REQ_SYNC);
3160 : }
3161 : EXPORT_SYMBOL(sync_dirty_buffer);
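/*
 * Illustrative sketch, not from this file: a data-integrity update of a
 * single metadata block, e.g. an on-disk superblock. The caller is assumed
 * to hold a reference on bh, as required above; myfs_commit_super is a
 * hypothetical helper.
 */
static int myfs_commit_super(struct buffer_head *bh)
{
        /* ... callers would update bh->b_data before this point ... */
        mark_buffer_dirty(bh);
        return sync_dirty_buffer(bh);   /* submits, waits, -EIO on failure */
}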
3162 :
3163 : /*
3164 : * try_to_free_buffers() checks if all the buffers on this particular page
3165 : * are unused, and releases them if so.
3166 : *
3167 : * Exclusion against try_to_free_buffers may be obtained by either
3168 : * locking the page or by holding its mapping's private_lock.
3169 : *
3170 : * If the page is dirty but all the buffers are clean then we need to
3171 : * be sure to mark the page clean as well. This is because the page
3172 : * may be against a block device, and a later reattachment of buffers
3173 : * to a dirty page will set *all* buffers dirty. Which would corrupt
3174 : * filesystem data on the same device.
3175 : *
3176 : * The same applies to regular filesystem pages: if all the buffers are
3177 : * clean then we set the page clean and proceed. To do that, we require
3178 : * total exclusion from block_dirty_folio(). That is obtained with
3179 : * private_lock.
3180 : *
3181 : * try_to_free_buffers() is non-blocking.
3182 : */
3183 : static inline int buffer_busy(struct buffer_head *bh)
3184 : {
3185 0 : return atomic_read(&bh->b_count) |
3186 0 : (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
3187 : }
3188 :
3189 : static int
3190 0 : drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
3191 : {
3192 0 : struct buffer_head *head = page_buffers(page);
3193 : struct buffer_head *bh;
3194 :
3195 0 : bh = head;
3196 : do {
3197 0 : if (buffer_busy(bh))
3198 : goto failed;
3199 0 : bh = bh->b_this_page;
3200 0 : } while (bh != head);
3201 :
3202 : do {
3203 0 : struct buffer_head *next = bh->b_this_page;
3204 :
3205 0 : if (bh->b_assoc_map)
3206 0 : __remove_assoc_queue(bh);
3207 0 : bh = next;
3208 0 : } while (bh != head);
3209 0 : *buffers_to_free = head;
3210 0 : detach_page_private(page);
3211 0 : return 1;
3212 : failed:
3213 : return 0;
3214 : }
3215 :
3216 0 : int try_to_free_buffers(struct page *page)
3217 : {
3218 0 : struct address_space * const mapping = page->mapping;
3219 0 : struct buffer_head *buffers_to_free = NULL;
3220 0 : int ret = 0;
3221 :
3222 0 : BUG_ON(!PageLocked(page));
3223 0 : if (PageWriteback(page))
3224 : return 0;
3225 :
3226 0 : if (mapping == NULL) { /* can this still happen? */
3227 0 : ret = drop_buffers(page, &buffers_to_free);
3228 0 : goto out;
3229 : }
3230 :
3231 0 : spin_lock(&mapping->private_lock);
3232 0 : ret = drop_buffers(page, &buffers_to_free);
3233 :
3234 : /*
3235 : * If the filesystem writes its buffers by hand (eg ext3)
3236 : * then we can have clean buffers against a dirty page. We
3237 : * clean the page here; otherwise the VM will never notice
3238 : * that the filesystem did any IO at all.
3239 : *
3240 : * Also, during truncate, discard_buffer will have marked all
3241 : * the page's buffers clean. We discover that here and clean
3242 : * the page also.
3243 : *
3244 : * private_lock must be held over this entire operation in order
3245 : * to synchronise against block_dirty_folio and prevent the
3246 : * dirty bit from being lost.
3247 : */
3248 0 : if (ret)
3249 0 : cancel_dirty_page(page);
3250 0 : spin_unlock(&mapping->private_lock);
3251 : out:
3252 0 : if (buffers_to_free) {
3253 : struct buffer_head *bh = buffers_to_free;
3254 :
3255 : do {
3256 0 : struct buffer_head *next = bh->b_this_page;
3257 0 : free_buffer_head(bh);
3258 0 : bh = next;
3259 0 : } while (bh != buffers_to_free);
3260 : }
3261 : return ret;
3262 : }
3263 : EXPORT_SYMBOL(try_to_free_buffers);
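/*
 * Illustrative sketch, not from this file: a filesystem with no extra
 * per-buffer bookkeeping can let ->releasepage delegate directly to
 * try_to_free_buffers(); journalling filesystems do their own checks first.
 * myfs_releasepage is a hypothetical name.
 */
static int myfs_releasepage(struct page *page, gfp_t gfp)
{
        return try_to_free_buffers(page);
}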
3264 :
3265 : /*
3266 : * Buffer-head allocation
3267 : */
3268 : static struct kmem_cache *bh_cachep __read_mostly;
3269 :
3270 : /*
3271 : * Once the number of bh's in the machine exceeds this level, we start
3272 : * stripping them in writeback.
3273 : */
3274 : static unsigned long max_buffer_heads;
3275 :
3276 : int buffer_heads_over_limit;
3277 :
3278 : struct bh_accounting {
3279 : int nr; /* Number of live bh's */
3280 : int ratelimit; /* Limit cacheline bouncing */
3281 : };
3282 :
3283 : static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3284 :
3285 : static void recalc_bh_state(void)
3286 : {
3287 : int i;
3288 0 : int tot = 0;
3289 :
3290 0 : if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
3291 : return;
3292 0 : __this_cpu_write(bh_accounting.ratelimit, 0);
3293 0 : for_each_online_cpu(i)
3294 0 : tot += per_cpu(bh_accounting, i).nr;
3295 0 : buffer_heads_over_limit = (tot > max_buffer_heads);
3296 : }
3297 :
3298 0 : struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3299 : {
3300 0 : struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
3301 0 : if (ret) {
3302 0 : INIT_LIST_HEAD(&ret->b_assoc_buffers);
3303 0 : spin_lock_init(&ret->b_uptodate_lock);
3304 0 : preempt_disable();
3305 0 : __this_cpu_inc(bh_accounting.nr);
3306 0 : recalc_bh_state();
3307 0 : preempt_enable();
3308 : }
3309 0 : return ret;
3310 : }
3311 : EXPORT_SYMBOL(alloc_buffer_head);
3312 :
3313 0 : void free_buffer_head(struct buffer_head *bh)
3314 : {
3315 0 : BUG_ON(!list_empty(&bh->b_assoc_buffers));
3316 0 : kmem_cache_free(bh_cachep, bh);
3317 0 : preempt_disable();
3318 0 : __this_cpu_dec(bh_accounting.nr);
3319 0 : recalc_bh_state();
3320 0 : preempt_enable();
3321 0 : }
3322 : EXPORT_SYMBOL(free_buffer_head);
3323 :
3324 0 : static int buffer_exit_cpu_dead(unsigned int cpu)
3325 : {
3326 : int i;
3327 0 : struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3328 :
3329 0 : for (i = 0; i < BH_LRU_SIZE; i++) {
3330 0 : brelse(b->bhs[i]);
3331 0 : b->bhs[i] = NULL;
3332 : }
3333 0 : this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
3334 0 : per_cpu(bh_accounting, cpu).nr = 0;
3335 0 : return 0;
3336 : }
3337 :
3338 : /**
3339 : * bh_uptodate_or_lock - Test whether the buffer is uptodate
3340 : * @bh: struct buffer_head
3341 : *
3342 : * Returns 1 if the buffer is up-to-date, or 0 with the
3343 : * buffer locked if it is not.
3344 : */
3345 0 : int bh_uptodate_or_lock(struct buffer_head *bh)
3346 : {
3347 0 : if (!buffer_uptodate(bh)) {
3348 0 : lock_buffer(bh);
3349 0 : if (!buffer_uptodate(bh))
3350 : return 0;
3351 : unlock_buffer(bh);
3352 : }
3353 : return 1;
3354 : }
3355 : EXPORT_SYMBOL(bh_uptodate_or_lock);
3356 :
3357 : /**
3358 : * bh_submit_read - Submit a locked buffer for reading
3359 : * @bh: struct buffer_head
3360 : *
3361 : * Returns zero on success and -EIO on error.
3362 : */
3363 0 : int bh_submit_read(struct buffer_head *bh)
3364 : {
3365 0 : BUG_ON(!buffer_locked(bh));
3366 :
3367 0 : if (buffer_uptodate(bh)) {
3368 0 : unlock_buffer(bh);
3369 0 : return 0;
3370 : }
3371 :
3372 0 : get_bh(bh);
3373 0 : bh->b_end_io = end_buffer_read_sync;
3374 0 : submit_bh(REQ_OP_READ, 0, bh);
3375 0 : wait_on_buffer(bh);
3376 0 : if (buffer_uptodate(bh))
3377 : return 0;
3378 0 : return -EIO;
3379 : }
3380 : EXPORT_SYMBOL(bh_submit_read);
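/*
 * Illustrative sketch, not from this file: bh_uptodate_or_lock() and
 * bh_submit_read() are meant to be used together, reading a mapped buffer
 * only when it is not already uptodate. myfs_read_block is a hypothetical
 * helper.
 */
static int myfs_read_block(struct buffer_head *bh)
{
        if (bh_uptodate_or_lock(bh))
                return 0;               /* already uptodate */
        return bh_submit_read(bh);      /* bh is locked; submits and waits */
}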
3381 :
3382 1 : void __init buffer_init(void)
3383 : {
3384 : unsigned long nrpages;
3385 : int ret;
3386 :
3387 1 : bh_cachep = kmem_cache_create("buffer_head",
3388 : sizeof(struct buffer_head), 0,
3389 : (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3390 : SLAB_MEM_SPREAD),
3391 : NULL);
3392 :
3393 : /*
3394 : * Limit the bh occupancy to 10% of ZONE_NORMAL
3395 : */
3396 1 : nrpages = (nr_free_buffer_pages() * 10) / 100;
3397 1 : max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3398 1 : ret = cpuhp_setup_state_nocalls(CPUHP_FS_BUFF_DEAD, "fs/buffer:dead",
3399 : NULL, buffer_exit_cpu_dead);
3400 1 : WARN_ON(ret < 0);
3401 1 : }