LCOV - code coverage report
Current view: top level - mm - filemap.c (source / functions)
Test: coverage.info                              Date: 2022-12-09 01:23:36
                     Hit      Total    Coverage
Lines:                 5       1174       0.4 %
Functions:             1         95       1.1 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-only
       2             : /*
       3             :  *      linux/mm/filemap.c
       4             :  *
       5             :  * Copyright (C) 1994-1999  Linus Torvalds
       6             :  */
       7             : 
       8             : /*
       9             :  * This file handles the generic file mmap semantics used by
      10             :  * most "normal" filesystems (but you don't /have/ to use this:
      11             :  * the NFS filesystem used to do this differently, for example)
      12             :  */
      13             : #include <linux/export.h>
      14             : #include <linux/compiler.h>
      15             : #include <linux/dax.h>
      16             : #include <linux/fs.h>
      17             : #include <linux/sched/signal.h>
      18             : #include <linux/uaccess.h>
      19             : #include <linux/capability.h>
      20             : #include <linux/kernel_stat.h>
      21             : #include <linux/gfp.h>
      22             : #include <linux/mm.h>
      23             : #include <linux/swap.h>
      24             : #include <linux/swapops.h>
      25             : #include <linux/mman.h>
      26             : #include <linux/pagemap.h>
      27             : #include <linux/file.h>
      28             : #include <linux/uio.h>
      29             : #include <linux/error-injection.h>
      30             : #include <linux/hash.h>
      31             : #include <linux/writeback.h>
      32             : #include <linux/backing-dev.h>
      33             : #include <linux/pagevec.h>
      34             : #include <linux/security.h>
      35             : #include <linux/cpuset.h>
      36             : #include <linux/hugetlb.h>
      37             : #include <linux/memcontrol.h>
      38             : #include <linux/shmem_fs.h>
      39             : #include <linux/rmap.h>
      40             : #include <linux/delayacct.h>
      41             : #include <linux/psi.h>
      42             : #include <linux/ramfs.h>
      43             : #include <linux/page_idle.h>
      44             : #include <linux/migrate.h>
      45             : #include <asm/pgalloc.h>
      46             : #include <asm/tlbflush.h>
      47             : #include "internal.h"
      48             : 
      49             : #define CREATE_TRACE_POINTS
      50             : #include <trace/events/filemap.h>
      51             : 
      52             : /*
      53             :  * FIXME: remove all knowledge of the buffer layer from the core VM
      54             :  */
      55             : #include <linux/buffer_head.h> /* for try_to_free_buffers */
      56             : 
      57             : #include <asm/mman.h>
      58             : 
      59             : /*
      60             :  * Shared mappings implemented 30.11.1994. It's not fully working yet,
      61             :  * though.
      62             :  *
      63             :  * Shared mappings now work. 15.8.1995  Bruno.
      64             :  *
      65             :  * finished 'unifying' the page and buffer cache and SMP-threaded the
      66             :  * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
      67             :  *
      68             :  * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
      69             :  */
      70             : 
      71             : /*
      72             :  * Lock ordering:
      73             :  *
      74             :  *  ->i_mmap_rwsem           (truncate_pagecache)
      75             :  *    ->private_lock         (__free_pte->block_dirty_folio)
      76             :  *      ->swap_lock          (exclusive_swap_page, others)
      77             :  *        ->i_pages lock
      78             :  *
      79             :  *  ->i_rwsem
      80             :  *    ->invalidate_lock              (acquired by fs in truncate path)
      81             :  *      ->i_mmap_rwsem               (truncate->unmap_mapping_range)
      82             :  *
      83             :  *  ->mmap_lock
      84             :  *    ->i_mmap_rwsem
      85             :  *      ->page_table_lock or pte_lock        (various, mainly in memory.c)
      86             :  *        ->i_pages lock     (arch-dependent flush_dcache_mmap_lock)
      87             :  *
      88             :  *  ->mmap_lock
      89             :  *    ->invalidate_lock              (filemap_fault)
      90             :  *      ->lock_page          (filemap_fault, access_process_vm)
      91             :  *
      92             :  *  ->i_rwsem                        (generic_perform_write)
      93             :  *    ->mmap_lock            (fault_in_readable->do_page_fault)
      94             :  *
      95             :  *  bdi->wb.list_lock
      96             :  *    sb_lock                   (fs/fs-writeback.c)
      97             :  *    ->i_pages lock         (__sync_single_inode)
      98             :  *
      99             :  *  ->i_mmap_rwsem
     100             :  *    ->anon_vma.lock                (vma_adjust)
     101             :  *
     102             :  *  ->anon_vma.lock
     103             :  *    ->page_table_lock or pte_lock  (anon_vma_prepare and various)
     104             :  *
     105             :  *  ->page_table_lock or pte_lock
     106             :  *    ->swap_lock            (try_to_unmap_one)
     107             :  *    ->private_lock         (try_to_unmap_one)
     108             :  *    ->i_pages lock         (try_to_unmap_one)
     109             :  *    ->lruvec->lru_lock  (follow_page->mark_page_accessed)
     110             :  *    ->lruvec->lru_lock  (check_pte_range->isolate_lru_page)
     111             :  *    ->private_lock         (page_remove_rmap->set_page_dirty)
     112             :  *    ->i_pages lock         (page_remove_rmap->set_page_dirty)
     113             :  *    bdi.wb->list_lock              (page_remove_rmap->set_page_dirty)
     114             :  *    ->inode->i_lock             (page_remove_rmap->set_page_dirty)
     115             :  *    ->memcg->move_lock  (page_remove_rmap->lock_page_memcg)
     116             :  *    bdi.wb->list_lock              (zap_pte_range->set_page_dirty)
     117             :  *    ->inode->i_lock             (zap_pte_range->set_page_dirty)
     118             :  *    ->private_lock         (zap_pte_range->block_dirty_folio)
     119             :  *
     120             :  * ->i_mmap_rwsem
     121             :  *   ->tasklist_lock            (memory_failure, collect_procs_ao)
     122             :  */
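Example (not part of filemap.c): a minimal, hypothetical sketch of a filesystem truncate-style path that honours the i_rwsem -> invalidate_lock -> i_mmap_rwsem chain documented above. The helpers used (inode_lock(), filemap_invalidate_lock(), truncate_setsize()) are real kernel APIs; myfs_truncate() is an invented name.

        #include <linux/fs.h>
        #include <linux/mm.h>

        /* Hypothetical sketch: take the locks in the documented order. */
        static void myfs_truncate(struct inode *inode, loff_t newsize)
        {
                struct address_space *mapping = inode->i_mapping;

                inode_lock(inode);                      /* ->i_rwsem */
                filemap_invalidate_lock(mapping);       /* ->invalidate_lock */
                truncate_setsize(inode, newsize);       /* takes ->i_mmap_rwsem internally */
                filemap_invalidate_unlock(mapping);
                inode_unlock(inode);
        }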
     123             : 
     124           0 : static void page_cache_delete(struct address_space *mapping,
     125             :                                    struct folio *folio, void *shadow)
     126             : {
     127           0 :         XA_STATE(xas, &mapping->i_pages, folio->index);
     128           0 :         long nr = 1;
     129             : 
     130           0 :         mapping_set_update(&xas, mapping);
     131             : 
     132             :         /* hugetlb pages are represented by a single entry in the xarray */
     133           0 :         if (!folio_test_hugetlb(folio)) {
     134           0 :                 xas_set_order(&xas, folio->index, folio_order(folio));
     135           0 :                 nr = folio_nr_pages(folio);
     136             :         }
     137             : 
     138             :         VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
     139             : 
     140           0 :         xas_store(&xas, shadow);
     141           0 :         xas_init_marks(&xas);
     142             : 
     143           0 :         folio->mapping = NULL;
     144             :         /* Leave page->index set: truncation lookup relies upon it */
     145           0 :         mapping->nrpages -= nr;
     146           0 : }
     147             : 
     148           0 : static void filemap_unaccount_folio(struct address_space *mapping,
     149             :                 struct folio *folio)
     150             : {
     151             :         long nr;
     152             : 
     153             :         VM_BUG_ON_FOLIO(folio_mapped(folio), folio);
     154           0 :         if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) {
     155           0 :                 pr_alert("BUG: Bad page cache in process %s  pfn:%05lx\n",
     156             :                          current->comm, folio_pfn(folio));
     157           0 :                 dump_page(&folio->page, "still mapped when deleted");
     158           0 :                 dump_stack();
     159           0 :                 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
     160             : 
     161           0 :                 if (mapping_exiting(mapping) && !folio_test_large(folio)) {
     162           0 :                         int mapcount = page_mapcount(&folio->page);
     163             : 
     164           0 :                         if (folio_ref_count(folio) >= mapcount + 2) {
     165             :                                 /*
     166             :                                  * All vmas have already been torn down, so it's
     167             :                                  * a good bet that actually the page is unmapped
     168             :                                  * and we'd rather not leak it: if we're wrong,
     169             :                                  * another bad page check should catch it later.
     170             :                                  */
     171           0 :                                 page_mapcount_reset(&folio->page);
     172             :                                 folio_ref_sub(folio, mapcount);
     173             :                         }
     174             :                 }
     175             :         }
     176             : 
     177             :         /* hugetlb folios do not participate in page cache accounting. */
     178           0 :         if (folio_test_hugetlb(folio))
     179             :                 return;
     180             : 
     181           0 :         nr = folio_nr_pages(folio);
     182             : 
     183           0 :         __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
     184           0 :         if (folio_test_swapbacked(folio)) {
     185           0 :                 __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
     186           0 :                 if (folio_test_pmd_mappable(folio))
     187             :                         __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr);
     188             :         } else if (folio_test_pmd_mappable(folio)) {
     189             :                 __lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr);
     190             :                 filemap_nr_thps_dec(mapping);
     191             :         }
     192             : 
     193             :         /*
     194             :          * At this point folio must be either written or cleaned by
     195             :          * truncate.  Dirty folio here signals a bug and loss of
     196             :          * unwritten data - on ordinary filesystems.
     197             :          *
     198             :          * But it's harmless on in-memory filesystems like tmpfs; and can
     199             :          * occur when a driver which did get_user_pages() sets page dirty
     200             :          * before putting it, while the inode is being finally evicted.
     201             :          *
     202             :          * Below fixes dirty accounting after removing the folio entirely
      203             :  * but leaves the dirty flag set: it has no effect on a truncated
      204             :  * folio and will in any case be cleared before the folio is
      205             :  * returned to the buddy allocator.
     206             :          */
     207           0 :         if (WARN_ON_ONCE(folio_test_dirty(folio) &&
     208             :                          mapping_can_writeback(mapping)))
     209           0 :                 folio_account_cleaned(folio, inode_to_wb(mapping->host));
     210             : }
     211             : 
     212             : /*
     213             :  * Delete a page from the page cache and free it. Caller has to make
     214             :  * sure the page is locked and that nobody else uses it - or that usage
     215             :  * is safe.  The caller must hold the i_pages lock.
     216             :  */
     217           0 : void __filemap_remove_folio(struct folio *folio, void *shadow)
     218             : {
     219           0 :         struct address_space *mapping = folio->mapping;
     220             : 
     221           0 :         trace_mm_filemap_delete_from_page_cache(folio);
     222           0 :         filemap_unaccount_folio(mapping, folio);
     223           0 :         page_cache_delete(mapping, folio, shadow);
     224           0 : }
     225             : 
     226           0 : void filemap_free_folio(struct address_space *mapping, struct folio *folio)
     227             : {
     228             :         void (*freepage)(struct page *);
     229           0 :         int refs = 1;
     230             : 
     231           0 :         freepage = mapping->a_ops->freepage;
     232           0 :         if (freepage)
     233           0 :                 freepage(&folio->page);
     234             : 
     235           0 :         if (folio_test_large(folio) && !folio_test_hugetlb(folio))
     236           0 :                 refs = folio_nr_pages(folio);
     237           0 :         folio_put_refs(folio, refs);
     238           0 : }
     239             : 
     240             : /**
     241             :  * filemap_remove_folio - Remove folio from page cache.
     242             :  * @folio: The folio.
     243             :  *
     244             :  * This must be called only on folios that are locked and have been
     245             :  * verified to be in the page cache.  It will never put the folio into
     246             :  * the free list because the caller has a reference on the page.
     247             :  */
     248           0 : void filemap_remove_folio(struct folio *folio)
     249             : {
     250           0 :         struct address_space *mapping = folio->mapping;
     251             : 
     252           0 :         BUG_ON(!folio_test_locked(folio));
     253           0 :         spin_lock(&mapping->host->i_lock);
     254           0 :         xa_lock_irq(&mapping->i_pages);
     255           0 :         __filemap_remove_folio(folio, NULL);
     256           0 :         xa_unlock_irq(&mapping->i_pages);
     257           0 :         if (mapping_shrinkable(mapping))
     258           0 :                 inode_add_lru(mapping->host);
     259           0 :         spin_unlock(&mapping->host->i_lock);
     260             : 
     261           0 :         filemap_free_folio(mapping, folio);
     262           0 : }
     263             : 
     264             : /*
     265             :  * page_cache_delete_batch - delete several folios from page cache
     266             :  * @mapping: the mapping to which folios belong
     267             :  * @fbatch: batch of folios to delete
     268             :  *
     269             :  * The function walks over mapping->i_pages and removes folios passed in
     270             :  * @fbatch from the mapping. The function expects @fbatch to be sorted
     271             :  * by page index and is optimised for it to be dense.
     272             :  * It tolerates holes in @fbatch (mapping entries at those indices are not
     273             :  * modified).
     274             :  *
     275             :  * The function expects the i_pages lock to be held.
     276             :  */
     277           0 : static void page_cache_delete_batch(struct address_space *mapping,
     278             :                              struct folio_batch *fbatch)
     279             : {
     280           0 :         XA_STATE(xas, &mapping->i_pages, fbatch->folios[0]->index);
     281           0 :         long total_pages = 0;
     282           0 :         int i = 0;
     283             :         struct folio *folio;
     284             : 
     285           0 :         mapping_set_update(&xas, mapping);
     286           0 :         xas_for_each(&xas, folio, ULONG_MAX) {
     287           0 :                 if (i >= folio_batch_count(fbatch))
     288             :                         break;
     289             : 
     290             :                 /* A swap/dax/shadow entry got inserted? Skip it. */
     291           0 :                 if (xa_is_value(folio))
     292           0 :                         continue;
     293             :                 /*
     294             :                  * A page got inserted in our range? Skip it. We have our
     295             :                  * pages locked so they are protected from being removed.
     296             :                  * If we see a page whose index is higher than ours, it
     297             :                  * means our page has been removed, which shouldn't be
     298             :                  * possible because we're holding the PageLock.
     299             :                  */
     300           0 :                 if (folio != fbatch->folios[i]) {
     301             :                         VM_BUG_ON_FOLIO(folio->index >
     302             :                                         fbatch->folios[i]->index, folio);
     303           0 :                         continue;
     304             :                 }
     305             : 
     306           0 :                 WARN_ON_ONCE(!folio_test_locked(folio));
     307             : 
     308           0 :                 folio->mapping = NULL;
     309             :                 /* Leave folio->index set: truncation lookup relies on it */
     310             : 
     311           0 :                 i++;
     312           0 :                 xas_store(&xas, NULL);
     313           0 :                 total_pages += folio_nr_pages(folio);
     314             :         }
     315           0 :         mapping->nrpages -= total_pages;
     316           0 : }
     317             : 
     318           0 : void delete_from_page_cache_batch(struct address_space *mapping,
     319             :                                   struct folio_batch *fbatch)
     320             : {
     321             :         int i;
     322             : 
     323           0 :         if (!folio_batch_count(fbatch))
     324             :                 return;
     325             : 
     326           0 :         spin_lock(&mapping->host->i_lock);
     327           0 :         xa_lock_irq(&mapping->i_pages);
     328           0 :         for (i = 0; i < folio_batch_count(fbatch); i++) {
     329           0 :                 struct folio *folio = fbatch->folios[i];
     330             : 
     331           0 :                 trace_mm_filemap_delete_from_page_cache(folio);
     332           0 :                 filemap_unaccount_folio(mapping, folio);
     333             :         }
     334           0 :         page_cache_delete_batch(mapping, fbatch);
     335           0 :         xa_unlock_irq(&mapping->i_pages);
     336           0 :         if (mapping_shrinkable(mapping))
     337           0 :                 inode_add_lru(mapping->host);
     338           0 :         spin_unlock(&mapping->host->i_lock);
     339             : 
     340           0 :         for (i = 0; i < folio_batch_count(fbatch); i++)
     341           0 :                 filemap_free_folio(mapping, fbatch->folios[i]);
     342             : }
     343             : 
     344           0 : int filemap_check_errors(struct address_space *mapping)
     345             : {
     346           0 :         int ret = 0;
     347             :         /* Check for outstanding write errors */
     348           0 :         if (test_bit(AS_ENOSPC, &mapping->flags) &&
     349           0 :             test_and_clear_bit(AS_ENOSPC, &mapping->flags))
     350           0 :                 ret = -ENOSPC;
     351           0 :         if (test_bit(AS_EIO, &mapping->flags) &&
     352           0 :             test_and_clear_bit(AS_EIO, &mapping->flags))
     353           0 :                 ret = -EIO;
     354           0 :         return ret;
     355             : }
     356             : EXPORT_SYMBOL(filemap_check_errors);
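Example (not part of filemap.c): the AS_EIO/AS_ENOSPC bits tested above are normally set by writeback completion paths via mapping_set_error(). A hypothetical producer/consumer pair, assuming invented "myfs" helper names:

        #include <linux/fs.h>
        #include <linux/pagemap.h>

        /* Record a writeback failure so a later filemap_check_errors()/fsync
         * on this mapping reports it. */
        static void myfs_end_writeback(struct address_space *mapping, int error)
        {
                if (error)
                        mapping_set_error(mapping, error);  /* sets AS_EIO or AS_ENOSPC, and wb_err */
        }

        /* Harvest and clear the legacy error bits. */
        static int myfs_harvest_errors(struct address_space *mapping)
        {
                return filemap_check_errors(mapping);       /* -EIO/-ENOSPC once, then cleared */
        }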
     357             : 
     358             : static int filemap_check_and_keep_errors(struct address_space *mapping)
     359             : {
     360             :         /* Check for outstanding write errors */
     361           0 :         if (test_bit(AS_EIO, &mapping->flags))
     362             :                 return -EIO;
     363           0 :         if (test_bit(AS_ENOSPC, &mapping->flags))
     364             :                 return -ENOSPC;
     365             :         return 0;
     366             : }
     367             : 
     368             : /**
     369             :  * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range
     370             :  * @mapping:    address space structure to write
     371             :  * @wbc:        the writeback_control controlling the writeout
     372             :  *
     373             :  * Call writepages on the mapping using the provided wbc to control the
     374             :  * writeout.
     375             :  *
     376             :  * Return: %0 on success, negative error code otherwise.
     377             :  */
     378           0 : int filemap_fdatawrite_wbc(struct address_space *mapping,
     379             :                            struct writeback_control *wbc)
     380             : {
     381             :         int ret;
     382             : 
     383           0 :         if (!mapping_can_writeback(mapping) ||
     384           0 :             !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
     385             :                 return 0;
     386             : 
     387           0 :         wbc_attach_fdatawrite_inode(wbc, mapping->host);
     388           0 :         ret = do_writepages(mapping, wbc);
     389           0 :         wbc_detach_inode(wbc);
     390           0 :         return ret;
     391             : }
     392             : EXPORT_SYMBOL(filemap_fdatawrite_wbc);
     393             : 
     394             : /**
     395             :  * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
     396             :  * @mapping:    address space structure to write
     397             :  * @start:      offset in bytes where the range starts
     398             :  * @end:        offset in bytes where the range ends (inclusive)
     399             :  * @sync_mode:  enable synchronous operation
     400             :  *
     401             :  * Start writeback against all of a mapping's dirty pages that lie
     402             :  * within the byte offsets <start, end> inclusive.
     403             :  *
     404             :  * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
     405             :  * opposed to a regular memory cleansing writeback.  The difference between
     406             :  * these two operations is that if a dirty page/buffer is encountered, it must
     407             :  * be waited upon, and not just skipped over.
     408             :  *
     409             :  * Return: %0 on success, negative error code otherwise.
     410             :  */
     411           0 : int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
     412             :                                 loff_t end, int sync_mode)
     413             : {
     414           0 :         struct writeback_control wbc = {
     415             :                 .sync_mode = sync_mode,
     416             :                 .nr_to_write = LONG_MAX,
     417             :                 .range_start = start,
     418             :                 .range_end = end,
     419             :         };
     420             : 
     421           0 :         return filemap_fdatawrite_wbc(mapping, &wbc);
     422             : }
     423             : 
     424             : static inline int __filemap_fdatawrite(struct address_space *mapping,
     425             :         int sync_mode)
     426             : {
     427           0 :         return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
     428             : }
     429             : 
     430           0 : int filemap_fdatawrite(struct address_space *mapping)
     431             : {
     432           0 :         return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
     433             : }
     434             : EXPORT_SYMBOL(filemap_fdatawrite);
     435             : 
     436           0 : int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
     437             :                                 loff_t end)
     438             : {
     439           0 :         return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
     440             : }
     441             : EXPORT_SYMBOL(filemap_fdatawrite_range);
     442             : 
     443             : /**
     444             :  * filemap_flush - mostly a non-blocking flush
     445             :  * @mapping:    target address_space
     446             :  *
     447             :  * This is a mostly non-blocking flush.  Not suitable for data-integrity
     448             :  * purposes - I/O may not be started against all dirty pages.
     449             :  *
     450             :  * Return: %0 on success, negative error code otherwise.
     451             :  */
     452           0 : int filemap_flush(struct address_space *mapping)
     453             : {
     454           0 :         return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
     455             : }
     456             : EXPORT_SYMBOL(filemap_flush);
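Example (not part of filemap.c): a hedged sketch contrasting the two sync modes described in the kernel-doc above; WB_SYNC_ALL is for data integrity and must not skip pages already under writeback, WB_SYNC_NONE is best effort. myfs_writeback_examples() is a hypothetical name.

        #include <linux/fs.h>

        static int myfs_writeback_examples(struct address_space *mapping)
        {
                int err;

                /* Data integrity writeout of the whole mapping (WB_SYNC_ALL). */
                err = filemap_fdatawrite_range(mapping, 0, LLONG_MAX);
                if (err)
                        return err;

                /* Best-effort cleansing (WB_SYNC_NONE); I/O may not be started
                 * against every dirty page. */
                return filemap_flush(mapping);
        }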
     457             : 
     458             : /**
     459             :  * filemap_range_has_page - check if a page exists in range.
     460             :  * @mapping:           address space within which to check
     461             :  * @start_byte:        offset in bytes where the range starts
     462             :  * @end_byte:          offset in bytes where the range ends (inclusive)
     463             :  *
     464             :  * Find at least one page in the range supplied, usually used to check if
     465             :  * direct writing in this range will trigger a writeback.
     466             :  *
     467             :  * Return: %true if at least one page exists in the specified range,
     468             :  * %false otherwise.
     469             :  */
     470           0 : bool filemap_range_has_page(struct address_space *mapping,
     471             :                            loff_t start_byte, loff_t end_byte)
     472             : {
     473             :         struct page *page;
     474           0 :         XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
     475           0 :         pgoff_t max = end_byte >> PAGE_SHIFT;
     476             : 
     477           0 :         if (end_byte < start_byte)
     478             :                 return false;
     479             : 
     480             :         rcu_read_lock();
     481             :         for (;;) {
     482           0 :                 page = xas_find(&xas, max);
     483           0 :                 if (xas_retry(&xas, page))
     484           0 :                         continue;
     485             :                 /* Shadow entries don't count */
     486           0 :                 if (xa_is_value(page))
     487           0 :                         continue;
     488             :                 /*
     489             :                  * We don't need to try to pin this page; we're about to
     490             :                  * release the RCU lock anyway.  It is enough to know that
     491             :                  * there was a page here recently.
     492             :                  */
     493             :                 break;
     494             :         }
     495             :         rcu_read_unlock();
     496             : 
     497           0 :         return page != NULL;
     498             : }
     499             : EXPORT_SYMBOL(filemap_range_has_page);
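Example (not part of filemap.c): the typical use hinted at in the kernel-doc above is a non-blocking direct write that cannot afford to flush the page cache. A hypothetical precheck, assuming a non-zero count; myfs_dio_precheck() is an invented name.

        #include <linux/fs.h>
        #include <linux/pagemap.h>

        static ssize_t myfs_dio_precheck(struct kiocb *iocb, loff_t pos, size_t count)
        {
                struct address_space *mapping = iocb->ki_filp->f_mapping;

                /* RWF_NOWAIT: bail out rather than block on cached pages. */
                if ((iocb->ki_flags & IOCB_NOWAIT) &&
                    filemap_range_has_page(mapping, pos, pos + count - 1))
                        return -EAGAIN;
                return 0;
        }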
     500             : 
     501           0 : static void __filemap_fdatawait_range(struct address_space *mapping,
     502             :                                      loff_t start_byte, loff_t end_byte)
     503             : {
     504           0 :         pgoff_t index = start_byte >> PAGE_SHIFT;
     505           0 :         pgoff_t end = end_byte >> PAGE_SHIFT;
     506             :         struct pagevec pvec;
     507             :         int nr_pages;
     508             : 
     509           0 :         if (end_byte < start_byte)
     510           0 :                 return;
     511             : 
     512           0 :         pagevec_init(&pvec);
     513           0 :         while (index <= end) {
     514             :                 unsigned i;
     515             : 
     516           0 :                 nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
     517             :                                 end, PAGECACHE_TAG_WRITEBACK);
     518           0 :                 if (!nr_pages)
     519             :                         break;
     520             : 
     521           0 :                 for (i = 0; i < nr_pages; i++) {
     522           0 :                         struct page *page = pvec.pages[i];
     523             : 
     524           0 :                         wait_on_page_writeback(page);
     525           0 :                         ClearPageError(page);
     526             :                 }
     527           0 :                 pagevec_release(&pvec);
     528           0 :                 cond_resched();
     529             :         }
     530             : }
     531             : 
     532             : /**
     533             :  * filemap_fdatawait_range - wait for writeback to complete
     534             :  * @mapping:            address space structure to wait for
     535             :  * @start_byte:         offset in bytes where the range starts
     536             :  * @end_byte:           offset in bytes where the range ends (inclusive)
     537             :  *
     538             :  * Walk the list of under-writeback pages of the given address space
     539             :  * in the given range and wait for all of them.  Check error status of
     540             :  * the address space and return it.
     541             :  *
     542             :  * Since the error status of the address space is cleared by this function,
     543             :  * callers are responsible for checking the return value and handling and/or
     544             :  * reporting the error.
     545             :  *
     546             :  * Return: error status of the address space.
     547             :  */
     548           0 : int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
     549             :                             loff_t end_byte)
     550             : {
     551           0 :         __filemap_fdatawait_range(mapping, start_byte, end_byte);
     552           0 :         return filemap_check_errors(mapping);
     553             : }
     554             : EXPORT_SYMBOL(filemap_fdatawait_range);
     555             : 
     556             : /**
     557             :  * filemap_fdatawait_range_keep_errors - wait for writeback to complete
     558             :  * @mapping:            address space structure to wait for
     559             :  * @start_byte:         offset in bytes where the range starts
     560             :  * @end_byte:           offset in bytes where the range ends (inclusive)
     561             :  *
     562             :  * Walk the list of under-writeback pages of the given address space in the
     563             :  * given range and wait for all of them.  Unlike filemap_fdatawait_range(),
     564             :  * this function does not clear error status of the address space.
     565             :  *
     566             :  * Use this function if callers don't handle errors themselves.  Expected
     567             :  * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
     568             :  * fsfreeze(8)
     569             :  */
     570           0 : int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
     571             :                 loff_t start_byte, loff_t end_byte)
     572             : {
     573           0 :         __filemap_fdatawait_range(mapping, start_byte, end_byte);
     574           0 :         return filemap_check_and_keep_errors(mapping);
     575             : }
     576             : EXPORT_SYMBOL(filemap_fdatawait_range_keep_errors);
     577             : 
     578             : /**
     579             :  * file_fdatawait_range - wait for writeback to complete
     580             :  * @file:               file pointing to address space structure to wait for
     581             :  * @start_byte:         offset in bytes where the range starts
     582             :  * @end_byte:           offset in bytes where the range ends (inclusive)
     583             :  *
     584             :  * Walk the list of under-writeback pages of the address space that file
     585             :  * refers to, in the given range and wait for all of them.  Check error
     586             :  * status of the address space vs. the file->f_wb_err cursor and return it.
     587             :  *
     588             :  * Since the error status of the file is advanced by this function,
     589             :  * callers are responsible for checking the return value and handling and/or
     590             :  * reporting the error.
     591             :  *
     592             :  * Return: error status of the address space vs. the file->f_wb_err cursor.
     593             :  */
     594           0 : int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte)
     595             : {
     596           0 :         struct address_space *mapping = file->f_mapping;
     597             : 
     598           0 :         __filemap_fdatawait_range(mapping, start_byte, end_byte);
     599           0 :         return file_check_and_advance_wb_err(file);
     600             : }
     601             : EXPORT_SYMBOL(file_fdatawait_range);
     602             : 
     603             : /**
     604             :  * filemap_fdatawait_keep_errors - wait for writeback without clearing errors
     605             :  * @mapping: address space structure to wait for
     606             :  *
     607             :  * Walk the list of under-writeback pages of the given address space
     608             :  * and wait for all of them.  Unlike filemap_fdatawait(), this function
     609             :  * does not clear error status of the address space.
     610             :  *
     611             :  * Use this function if callers don't handle errors themselves.  Expected
     612             :  * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
     613             :  * fsfreeze(8)
     614             :  *
     615             :  * Return: error status of the address space.
     616             :  */
     617           0 : int filemap_fdatawait_keep_errors(struct address_space *mapping)
     618             : {
     619           0 :         __filemap_fdatawait_range(mapping, 0, LLONG_MAX);
     620           0 :         return filemap_check_and_keep_errors(mapping);
     621             : }
     622             : EXPORT_SYMBOL(filemap_fdatawait_keep_errors);
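Example (not part of filemap.c): a hedged sketch of the sync(2)-style caller the kernel-doc above has in mind, which waits for writeback but leaves the error state for each file's own fsync to report. myfs_sync_mapping() is a hypothetical name.

        #include <linux/fs.h>

        static void myfs_sync_mapping(struct address_space *mapping)
        {
                filemap_fdatawrite(mapping);              /* WB_SYNC_ALL writeout */
                filemap_fdatawait_keep_errors(mapping);   /* wait, keep AS_EIO/AS_ENOSPC set */
        }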
     623             : 
     624             : /* Returns true if writeback might be needed or already in progress. */
     625             : static bool mapping_needs_writeback(struct address_space *mapping)
     626             : {
     627             :         return mapping->nrpages;
     628             : }
     629             : 
     630           0 : bool filemap_range_has_writeback(struct address_space *mapping,
     631             :                                  loff_t start_byte, loff_t end_byte)
     632             : {
     633           0 :         XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
     634           0 :         pgoff_t max = end_byte >> PAGE_SHIFT;
     635             :         struct page *page;
     636             : 
     637           0 :         if (end_byte < start_byte)
     638             :                 return false;
     639             : 
     640             :         rcu_read_lock();
     641           0 :         xas_for_each(&xas, page, max) {
     642           0 :                 if (xas_retry(&xas, page))
     643           0 :                         continue;
     644           0 :                 if (xa_is_value(page))
     645           0 :                         continue;
     646           0 :                 if (PageDirty(page) || PageLocked(page) || PageWriteback(page))
     647             :                         break;
     648             :         }
     649             :         rcu_read_unlock();
     650           0 :         return page != NULL;
     651             : }
     652             : EXPORT_SYMBOL_GPL(filemap_range_has_writeback);
     653             : 
     654             : /**
     655             :  * filemap_write_and_wait_range - write out & wait on a file range
     656             :  * @mapping:    the address_space for the pages
     657             :  * @lstart:     offset in bytes where the range starts
     658             :  * @lend:       offset in bytes where the range ends (inclusive)
     659             :  *
     660             :  * Write out and wait upon file offsets lstart->lend, inclusive.
     661             :  *
     662             :  * Note that @lend is inclusive (describes the last byte to be written) so
     663             :  * that this function can be used to write to the very end-of-file (end = -1).
     664             :  *
     665             :  * Return: error status of the address space.
     666             :  */
     667           0 : int filemap_write_and_wait_range(struct address_space *mapping,
     668             :                                  loff_t lstart, loff_t lend)
     669             : {
     670           0 :         int err = 0;
     671             : 
     672           0 :         if (mapping_needs_writeback(mapping)) {
     673           0 :                 err = __filemap_fdatawrite_range(mapping, lstart, lend,
     674             :                                                  WB_SYNC_ALL);
     675             :                 /*
     676             :                  * Even if the above returned error, the pages may be
     677             :                  * written partially (e.g. -ENOSPC), so we wait for it.
      678             :                  * But -EIO is a special case; it may indicate that the worst
      679             :                  * (e.g. a bug) has happened, so we avoid waiting for it.
     680             :                  */
     681           0 :                 if (err != -EIO) {
     682           0 :                         int err2 = filemap_fdatawait_range(mapping,
     683             :                                                 lstart, lend);
     684           0 :                         if (!err)
     685           0 :                                 err = err2;
     686             :                 } else {
     687             :                         /* Clear any previously stored errors */
     688           0 :                         filemap_check_errors(mapping);
     689             :                 }
     690             :         } else {
     691           0 :                 err = filemap_check_errors(mapping);
     692             :         }
     693           0 :         return err;
     694             : }
     695             : EXPORT_SYMBOL(filemap_write_and_wait_range);
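Example (not part of filemap.c): the common use of filemap_write_and_wait_range() is flushing a byte range to storage before operating on it directly (note @lend is inclusive, hence the "- 1"). A hypothetical wrapper, assuming a non-zero count:

        #include <linux/fs.h>

        static int myfs_flush_before_dio(struct address_space *mapping,
                                         loff_t pos, size_t count)
        {
                return filemap_write_and_wait_range(mapping, pos, pos + count - 1);
        }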
     696             : 
     697           0 : void __filemap_set_wb_err(struct address_space *mapping, int err)
     698             : {
     699           0 :         errseq_t eseq = errseq_set(&mapping->wb_err, err);
     700             : 
     701           0 :         trace_filemap_set_wb_err(mapping, eseq);
     702           0 : }
     703             : EXPORT_SYMBOL(__filemap_set_wb_err);
     704             : 
     705             : /**
      706             :  * file_check_and_advance_wb_err - report wb error (if any) that was previously
      707             :  *                                 reported and advance wb_err to the current one
     708             :  * @file: struct file on which the error is being reported
     709             :  *
     710             :  * When userland calls fsync (or something like nfsd does the equivalent), we
     711             :  * want to report any writeback errors that occurred since the last fsync (or
     712             :  * since the file was opened if there haven't been any).
     713             :  *
     714             :  * Grab the wb_err from the mapping. If it matches what we have in the file,
     715             :  * then just quickly return 0. The file is all caught up.
     716             :  *
     717             :  * If it doesn't match, then take the mapping value, set the "seen" flag in
     718             :  * it and try to swap it into place. If it works, or another task beat us
     719             :  * to it with the new value, then update the f_wb_err and return the error
     720             :  * portion. The error at this point must be reported via proper channels
      721             :  * (a la fsync, or the NFS COMMIT operation, etc.).
     722             :  *
     723             :  * While we handle mapping->wb_err with atomic operations, the f_wb_err
     724             :  * value is protected by the f_lock since we must ensure that it reflects
     725             :  * the latest value swapped in for this file descriptor.
     726             :  *
     727             :  * Return: %0 on success, negative error code otherwise.
     728             :  */
     729           0 : int file_check_and_advance_wb_err(struct file *file)
     730             : {
     731           0 :         int err = 0;
     732           0 :         errseq_t old = READ_ONCE(file->f_wb_err);
     733           0 :         struct address_space *mapping = file->f_mapping;
     734             : 
     735             :         /* Locklessly handle the common case where nothing has changed */
     736           0 :         if (errseq_check(&mapping->wb_err, old)) {
     737             :                 /* Something changed, must use slow path */
     738           0 :                 spin_lock(&file->f_lock);
     739           0 :                 old = file->f_wb_err;
     740           0 :                 err = errseq_check_and_advance(&mapping->wb_err,
     741             :                                                 &file->f_wb_err);
     742           0 :                 trace_file_check_and_advance_wb_err(file, old);
     743           0 :                 spin_unlock(&file->f_lock);
     744             :         }
     745             : 
     746             :         /*
     747             :          * We're mostly using this function as a drop in replacement for
     748             :          * filemap_check_errors. Clear AS_EIO/AS_ENOSPC to emulate the effect
     749             :          * that the legacy code would have had on these flags.
     750             :          */
     751           0 :         clear_bit(AS_EIO, &mapping->flags);
     752           0 :         clear_bit(AS_ENOSPC, &mapping->flags);
     753           0 :         return err;
     754             : }
     755             : EXPORT_SYMBOL(file_check_and_advance_wb_err);
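Example (not part of filemap.c): a purely illustrative sketch of the errseq_t protocol described above, using the errseq primitives directly: sample a cursor (roughly what happens to f_wb_err at open time), let writeback record an error, then check-and-advance reports it exactly once to this observer.

        #include <linux/errseq.h>
        #include <linux/errno.h>

        static int errseq_demo(void)
        {
                errseq_t wb_err = 0;
                errseq_t cursor = errseq_sample(&wb_err);   /* remember current point */

                errseq_set(&wb_err, -EIO);                  /* as mapping_set_error() would */

                /* Reports -EIO and advances cursor so it is not reported again. */
                return errseq_check_and_advance(&wb_err, &cursor);
        }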
     756             : 
     757             : /**
     758             :  * file_write_and_wait_range - write out & wait on a file range
     759             :  * @file:       file pointing to address_space with pages
     760             :  * @lstart:     offset in bytes where the range starts
     761             :  * @lend:       offset in bytes where the range ends (inclusive)
     762             :  *
     763             :  * Write out and wait upon file offsets lstart->lend, inclusive.
     764             :  *
     765             :  * Note that @lend is inclusive (describes the last byte to be written) so
     766             :  * that this function can be used to write to the very end-of-file (end = -1).
     767             :  *
     768             :  * After writing out and waiting on the data, we check and advance the
     769             :  * f_wb_err cursor to the latest value, and return any errors detected there.
     770             :  *
     771             :  * Return: %0 on success, negative error code otherwise.
     772             :  */
     773           0 : int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
     774             : {
     775           0 :         int err = 0, err2;
     776           0 :         struct address_space *mapping = file->f_mapping;
     777             : 
     778           0 :         if (mapping_needs_writeback(mapping)) {
     779           0 :                 err = __filemap_fdatawrite_range(mapping, lstart, lend,
     780             :                                                  WB_SYNC_ALL);
     781             :                 /* See comment of filemap_write_and_wait() */
     782           0 :                 if (err != -EIO)
     783           0 :                         __filemap_fdatawait_range(mapping, lstart, lend);
     784             :         }
     785           0 :         err2 = file_check_and_advance_wb_err(file);
     786           0 :         if (!err)
     787           0 :                 err = err2;
     788           0 :         return err;
     789             : }
     790             : EXPORT_SYMBOL(file_write_and_wait_range);
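Example (not part of filemap.c): a hedged sketch of a minimal ->fsync built on file_write_and_wait_range(), which both flushes the range and advances the f_wb_err cursor. myfs_fsync() is a hypothetical name; a real filesystem would also commit its metadata.

        #include <linux/fs.h>

        static int myfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
        {
                int err = file_write_and_wait_range(file, start, end);

                if (err)
                        return err;

                /* commit metadata / journal here in a real filesystem */
                return 0;
        }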
     791             : 
     792             : /**
     793             :  * replace_page_cache_page - replace a pagecache page with a new one
     794             :  * @old:        page to be replaced
     795             :  * @new:        page to replace with
     796             :  *
     797             :  * This function replaces a page in the pagecache with a new one.  On
     798             :  * success it acquires the pagecache reference for the new page and
     799             :  * drops it for the old page.  Both the old and new pages must be
     800             :  * locked.  This function does not add the new page to the LRU, the
     801             :  * caller must do that.
     802             :  *
     803             :  * The remove + add is atomic.  This function cannot fail.
     804             :  */
     805           0 : void replace_page_cache_page(struct page *old, struct page *new)
     806             : {
     807           0 :         struct folio *fold = page_folio(old);
     808           0 :         struct folio *fnew = page_folio(new);
     809           0 :         struct address_space *mapping = old->mapping;
     810           0 :         void (*freepage)(struct page *) = mapping->a_ops->freepage;
     811           0 :         pgoff_t offset = old->index;
     812           0 :         XA_STATE(xas, &mapping->i_pages, offset);
     813             : 
     814             :         VM_BUG_ON_PAGE(!PageLocked(old), old);
     815             :         VM_BUG_ON_PAGE(!PageLocked(new), new);
     816             :         VM_BUG_ON_PAGE(new->mapping, new);
     817             : 
     818           0 :         get_page(new);
     819           0 :         new->mapping = mapping;
     820           0 :         new->index = offset;
     821             : 
     822           0 :         mem_cgroup_migrate(fold, fnew);
     823             : 
     824           0 :         xas_lock_irq(&xas);
     825           0 :         xas_store(&xas, new);
     826             : 
     827           0 :         old->mapping = NULL;
     828             :         /* hugetlb pages do not participate in page cache accounting. */
     829           0 :         if (!PageHuge(old))
     830           0 :                 __dec_lruvec_page_state(old, NR_FILE_PAGES);
     831           0 :         if (!PageHuge(new))
     832           0 :                 __inc_lruvec_page_state(new, NR_FILE_PAGES);
     833           0 :         if (PageSwapBacked(old))
     834           0 :                 __dec_lruvec_page_state(old, NR_SHMEM);
     835           0 :         if (PageSwapBacked(new))
     836           0 :                 __inc_lruvec_page_state(new, NR_SHMEM);
     837           0 :         xas_unlock_irq(&xas);
     838           0 :         if (freepage)
     839           0 :                 freepage(old);
     840           0 :         put_page(old);
     841           0 : }
     842             : EXPORT_SYMBOL_GPL(replace_page_cache_page);
     843             : 
     844           0 : noinline int __filemap_add_folio(struct address_space *mapping,
     845             :                 struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
     846             : {
     847           0 :         XA_STATE(xas, &mapping->i_pages, index);
     848           0 :         int huge = folio_test_hugetlb(folio);
     849           0 :         bool charged = false;
     850           0 :         long nr = 1;
     851             : 
     852             :         VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
     853             :         VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio);
     854           0 :         mapping_set_update(&xas, mapping);
     855             : 
     856             :         if (!huge) {
     857           0 :                 int error = mem_cgroup_charge(folio, NULL, gfp);
     858             :                 VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
     859             :                 if (error)
     860             :                         return error;
     861           0 :                 charged = true;
     862           0 :                 xas_set_order(&xas, index, folio_order(folio));
     863           0 :                 nr = folio_nr_pages(folio);
     864             :         }
     865             : 
     866           0 :         gfp &= GFP_RECLAIM_MASK;
     867           0 :         folio_ref_add(folio, nr);
     868           0 :         folio->mapping = mapping;
     869           0 :         folio->index = xas.xa_index;
     870             : 
     871             :         do {
     872           0 :                 unsigned int order = xa_get_order(xas.xa, xas.xa_index);
     873           0 :                 void *entry, *old = NULL;
     874             : 
     875           0 :                 if (order > folio_order(folio))
     876             :                         xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index),
     877             :                                         order, gfp);
     878           0 :                 xas_lock_irq(&xas);
     879           0 :                 xas_for_each_conflict(&xas, entry) {
     880           0 :                         old = entry;
     881           0 :                         if (!xa_is_value(entry)) {
     882           0 :                                 xas_set_err(&xas, -EEXIST);
     883             :                                 goto unlock;
     884             :                         }
     885             :                 }
     886             : 
     887           0 :                 if (old) {
     888           0 :                         if (shadowp)
     889           0 :                                 *shadowp = old;
     890             :                         /* entry may have been split before we acquired lock */
     891           0 :                         order = xa_get_order(xas.xa, xas.xa_index);
     892             :                         if (order > folio_order(folio)) {
     893             :                                 /* How to handle large swap entries? */
     894             :                                 BUG_ON(shmem_mapping(mapping));
     895             :                                 xas_split(&xas, old, order);
     896             :                                 xas_reset(&xas);
     897             :                         }
     898             :                 }
     899             : 
     900           0 :                 xas_store(&xas, folio);
     901           0 :                 if (xas_error(&xas))
     902             :                         goto unlock;
     903             : 
     904           0 :                 mapping->nrpages += nr;
     905             : 
     906             :                 /* hugetlb pages do not participate in page cache accounting */
     907             :                 if (!huge) {
     908           0 :                         __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
     909           0 :                         if (folio_test_pmd_mappable(folio))
     910             :                                 __lruvec_stat_mod_folio(folio,
     911             :                                                 NR_FILE_THPS, nr);
     912             :                 }
     913             : unlock:
     914           0 :                 xas_unlock_irq(&xas);
     915           0 :         } while (xas_nomem(&xas, gfp));
     916             : 
     917           0 :         if (xas_error(&xas))
     918             :                 goto error;
     919             : 
     920             :         trace_mm_filemap_add_to_page_cache(folio);
     921             :         return 0;
     922             : error:
     923             :         if (charged)
     924             :                 mem_cgroup_uncharge(folio);
     925           0 :         folio->mapping = NULL;
     926             :         /* Leave page->index set: truncation relies upon it */
     927           0 :         folio_put_refs(folio, nr);
     928           0 :         return xas_error(&xas);
     929             : }
     930             : ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO);
     931             : 
     932             : /**
     933             :  * add_to_page_cache_locked - add a locked page to the pagecache
     934             :  * @page:       page to add
     935             :  * @mapping:    the page's address_space
     936             :  * @offset:     page index
     937             :  * @gfp_mask:   page allocation mode
     938             :  *
     939             :  * This function is used to add a page to the pagecache. It must be locked.
     940             :  * This function does not add the page to the LRU.  The caller must do that.
     941             :  *
     942             :  * Return: %0 on success, negative error code otherwise.
     943             :  */
     944           0 : int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
     945             :                 pgoff_t offset, gfp_t gfp_mask)
     946             : {
     947           0 :         return __filemap_add_folio(mapping, page_folio(page), offset,
     948             :                                           gfp_mask, NULL);
     949             : }
     950             : EXPORT_SYMBOL(add_to_page_cache_locked);
     951             : 
     952           0 : int filemap_add_folio(struct address_space *mapping, struct folio *folio,
     953             :                                 pgoff_t index, gfp_t gfp)
     954             : {
     955           0 :         void *shadow = NULL;
     956             :         int ret;
     957             : 
     958           0 :         __folio_set_locked(folio);
     959           0 :         ret = __filemap_add_folio(mapping, folio, index, gfp, &shadow);
     960           0 :         if (unlikely(ret))
     961             :                 __folio_clear_locked(folio);
     962             :         else {
     963             :                 /*
     964             :                  * The folio might have been evicted from cache only
     965             :                  * recently, in which case it should be activated like
     966             :                  * any other repeatedly accessed folio.
     967             :                  * The exception is folios getting rewritten; evicting other
     968             :                  * data from the working set, only to cache data that will
     969             :                  * get overwritten with something else, is a waste of memory.
     970             :                  */
     971           0 :                 WARN_ON_ONCE(folio_test_active(folio));
     972           0 :                 if (!(gfp & __GFP_WRITE) && shadow)
     973           0 :                         workingset_refault(folio, shadow);
     974           0 :                 folio_add_lru(folio);
     975             :         }
     976           0 :         return ret;
     977             : }
     978             : EXPORT_SYMBOL_GPL(filemap_add_folio);
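
/*
 * Editor's illustration, not part of filemap.c: a minimal sketch of how a
 * read path might allocate and insert a new folio using the helpers above.
 * example_add_new_folio() is a hypothetical name.
 */
static struct folio *example_add_new_folio(struct address_space *mapping,
                                           pgoff_t index, gfp_t gfp)
{
        struct folio *folio = filemap_alloc_folio(gfp, 0);
        int err;

        if (!folio)
                return NULL;

        err = filemap_add_folio(mapping, folio, index, gfp);
        if (err) {
                folio_put(folio);
                /*
                 * -EEXIST means another task inserted a folio at 'index'
                 * first; the caller would normally retry its lookup.
                 */
                return NULL;
        }
        return folio;   /* locked, on the LRU, with our reference held */
}
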
     979             : 
     980             : #ifdef CONFIG_NUMA
     981             : struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order)
     982             : {
     983             :         int n;
     984             :         struct folio *folio;
     985             : 
     986             :         if (cpuset_do_page_mem_spread()) {
     987             :                 unsigned int cpuset_mems_cookie;
     988             :                 do {
     989             :                         cpuset_mems_cookie = read_mems_allowed_begin();
     990             :                         n = cpuset_mem_spread_node();
     991             :                         folio = __folio_alloc_node(gfp, order, n);
     992             :                 } while (!folio && read_mems_allowed_retry(cpuset_mems_cookie));
     993             : 
     994             :                 return folio;
     995             :         }
     996             :         return folio_alloc(gfp, order);
     997             : }
     998             : EXPORT_SYMBOL(filemap_alloc_folio);
     999             : #endif
    1000             : 
    1001             : /*
    1002             :  * filemap_invalidate_lock_two - lock invalidate_lock for two mappings
    1003             :  *
     1004             :  * Take the invalidate_lock exclusively on any passed mapping that is not NULL.
    1005             :  *
    1006             :  * @mapping1: the first mapping to lock
    1007             :  * @mapping2: the second mapping to lock
    1008             :  */
    1009           0 : void filemap_invalidate_lock_two(struct address_space *mapping1,
    1010             :                                  struct address_space *mapping2)
    1011             : {
    1012           0 :         if (mapping1 > mapping2)
    1013           0 :                 swap(mapping1, mapping2);
    1014           0 :         if (mapping1)
    1015           0 :                 down_write(&mapping1->invalidate_lock);
    1016           0 :         if (mapping2 && mapping1 != mapping2)
    1017           0 :                 down_write_nested(&mapping2->invalidate_lock, 1);
    1018           0 : }
    1019             : EXPORT_SYMBOL(filemap_invalidate_lock_two);
    1020             : 
    1021             : /*
    1022             :  * filemap_invalidate_unlock_two - unlock invalidate_lock for two mappings
    1023             :  *
     1024             :  * Release the exclusively held invalidate_lock of any passed mapping that is not NULL.
    1025             :  *
    1026             :  * @mapping1: the first mapping to unlock
    1027             :  * @mapping2: the second mapping to unlock
    1028             :  */
    1029           0 : void filemap_invalidate_unlock_two(struct address_space *mapping1,
    1030             :                                    struct address_space *mapping2)
    1031             : {
    1032           0 :         if (mapping1)
    1033           0 :                 up_write(&mapping1->invalidate_lock);
    1034           0 :         if (mapping2 && mapping1 != mapping2)
    1035           0 :                 up_write(&mapping2->invalidate_lock);
    1036           0 : }
    1037             : EXPORT_SYMBOL(filemap_invalidate_unlock_two);
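
/*
 * Editor's illustration, not part of filemap.c: a cross-file operation
 * (e.g. a dedupe/remap style path) could fence off racing page-cache users
 * of both files like this. example_cross_file_op() and the work in the
 * middle are hypothetical.
 */
static void example_cross_file_op(struct file *src, struct file *dst)
{
        struct address_space *m1 = src->f_mapping;
        struct address_space *m2 = dst->f_mapping;

        filemap_invalidate_lock_two(m1, m2);    /* handles m1 == m2 and NULL */
        /* ... invalidate page cache and update both mappings here ... */
        filemap_invalidate_unlock_two(m1, m2);
}
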
    1038             : 
    1039             : /*
     1040             :  * In order to wait for pages to become available there must be
     1041             :  * waitqueues associated with pages.  Rather than giving each page its
     1042             :  * own queue, we use a hash table of waitqueues: all waiters for pages
     1043             :  * that hash to the same bucket share one queue, every waiter on that
     1044             :  * queue is woken when any of those pages becomes available, and each
     1045             :  * woken context re-checks that the page it actually cares about became
     1046             :  * available.  This saves space at the cost of "thundering herd"
     1047             :  * phenomena during rare hash collisions.
    1048             :  */
    1049             : #define PAGE_WAIT_TABLE_BITS 8
    1050             : #define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
    1051             : static wait_queue_head_t folio_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
    1052             : 
    1053             : static wait_queue_head_t *folio_waitqueue(struct folio *folio)
    1054             : {
    1055           0 :         return &folio_wait_table[hash_ptr(folio, PAGE_WAIT_TABLE_BITS)];
    1056             : }
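
/*
 * Editor's illustration, not part of filemap.c: two unrelated folios may
 * hash to the same bucket, which is exactly the "thundering herd" case
 * described above. example_share_waitqueue() is a hypothetical helper.
 */
static bool example_share_waitqueue(struct folio *a, struct folio *b)
{
        /*
         * May well be true: a waker then wakes both sets of waiters, and
         * every woken waiter re-checks its own folio's bit.
         */
        return folio_waitqueue(a) == folio_waitqueue(b);
}
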
    1057             : 
    1058           1 : void __init pagecache_init(void)
    1059             : {
    1060             :         int i;
    1061             : 
    1062         257 :         for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
    1063         256 :                 init_waitqueue_head(&folio_wait_table[i]);
    1064             : 
    1065           1 :         page_writeback_init();
    1066           1 : }
    1067             : 
    1068             : /*
    1069             :  * The page wait code treats the "wait->flags" somewhat unusually, because
    1070             :  * we have multiple different kinds of waits, not just the usual "exclusive"
    1071             :  * one.
    1072             :  *
    1073             :  * We have:
    1074             :  *
    1075             :  *  (a) no special bits set:
    1076             :  *
    1077             :  *      We're just waiting for the bit to be released, and when a waker
    1078             :  *      calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up,
    1079             :  *      and remove it from the wait queue.
    1080             :  *
    1081             :  *      Simple and straightforward.
    1082             :  *
    1083             :  *  (b) WQ_FLAG_EXCLUSIVE:
    1084             :  *
    1085             :  *      The waiter is waiting to get the lock, and only one waiter should
    1086             :  *      be woken up to avoid any thundering herd behavior. We'll set the
    1087             :  *      WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue.
    1088             :  *
    1089             :  *      This is the traditional exclusive wait.
    1090             :  *
    1091             :  *  (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM:
    1092             :  *
    1093             :  *      The waiter is waiting to get the bit, and additionally wants the
    1094             :  *      lock to be transferred to it for fair lock behavior. If the lock
    1095             :  *      cannot be taken, we stop walking the wait queue without waking
    1096             :  *      the waiter.
    1097             :  *
    1098             :  *      This is the "fair lock handoff" case, and in addition to setting
    1099             :  *      WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see
    1100             :  *      that it now has the lock.
    1101             :  */
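
/*
 * Editor's illustration, not part of the original source: the wait->flags a
 * waiter starts out with in the three cases above. The array is hypothetical
 * and exists purely as a key for reading wake_page_function() below.
 */
static const unsigned int example_initial_wait_flags[] = {
        0,                                      /* (a) plain bit wait      */
        WQ_FLAG_EXCLUSIVE,                      /* (b) exclusive lock wait */
        WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM,     /* (c) fair lock handoff   */
};
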
    1102           0 : static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
    1103             : {
    1104             :         unsigned int flags;
    1105           0 :         struct wait_page_key *key = arg;
    1106           0 :         struct wait_page_queue *wait_page
    1107           0 :                 = container_of(wait, struct wait_page_queue, wait);
    1108             : 
    1109           0 :         if (!wake_page_match(wait_page, key))
    1110             :                 return 0;
    1111             : 
    1112             :         /*
    1113             :          * If it's a lock handoff wait, we get the bit for it, and
    1114             :          * stop walking (and do not wake it up) if we can't.
    1115             :          */
    1116           0 :         flags = wait->flags;
    1117           0 :         if (flags & WQ_FLAG_EXCLUSIVE) {
    1118           0 :                 if (test_bit(key->bit_nr, &key->folio->flags))
    1119             :                         return -1;
    1120           0 :                 if (flags & WQ_FLAG_CUSTOM) {
    1121           0 :                         if (test_and_set_bit(key->bit_nr, &key->folio->flags))
    1122             :                                 return -1;
    1123           0 :                         flags |= WQ_FLAG_DONE;
    1124             :                 }
    1125             :         }
    1126             : 
    1127             :         /*
    1128             :          * We are holding the wait-queue lock, but the waiter that
    1129             :          * is waiting for this will be checking the flags without
    1130             :          * any locking.
    1131             :          *
    1132             :          * So update the flags atomically, and wake up the waiter
    1133             :          * afterwards to avoid any races. This store-release pairs
    1134             :          * with the load-acquire in folio_wait_bit_common().
    1135             :          */
    1136           0 :         smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
    1137           0 :         wake_up_state(wait->private, mode);
    1138             : 
    1139             :         /*
    1140             :          * Ok, we have successfully done what we're waiting for,
    1141             :          * and we can unconditionally remove the wait entry.
    1142             :          *
    1143             :          * Note that this pairs with the "finish_wait()" in the
    1144             :          * waiter, and has to be the absolute last thing we do.
    1145             :          * After this list_del_init(&wait->entry) the wait entry
    1146             :          * might be de-allocated and the process might even have
    1147             :          * exited.
    1148             :          */
    1149           0 :         list_del_init_careful(&wait->entry);
    1150           0 :         return (flags & WQ_FLAG_EXCLUSIVE) != 0;
    1151             : }
    1152             : 
    1153           0 : static void folio_wake_bit(struct folio *folio, int bit_nr)
    1154             : {
    1155           0 :         wait_queue_head_t *q = folio_waitqueue(folio);
    1156             :         struct wait_page_key key;
    1157             :         unsigned long flags;
    1158             :         wait_queue_entry_t bookmark;
    1159             : 
    1160           0 :         key.folio = folio;
    1161           0 :         key.bit_nr = bit_nr;
    1162           0 :         key.page_match = 0;
    1163             : 
    1164           0 :         bookmark.flags = 0;
    1165           0 :         bookmark.private = NULL;
    1166           0 :         bookmark.func = NULL;
    1167           0 :         INIT_LIST_HEAD(&bookmark.entry);
    1168             : 
    1169           0 :         spin_lock_irqsave(&q->lock, flags);
    1170           0 :         __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
    1171             : 
    1172           0 :         while (bookmark.flags & WQ_FLAG_BOOKMARK) {
    1173             :                 /*
     1174             :                  * Take a breather from holding the lock:
     1175             :                  * let waiters that have finished waking up asynchronously
     1176             :                  * acquire the lock and remove themselves
     1177             :                  * from the wait queue.
    1178             :                  */
    1179           0 :                 spin_unlock_irqrestore(&q->lock, flags);
    1180             :                 cpu_relax();
    1181           0 :                 spin_lock_irqsave(&q->lock, flags);
    1182           0 :                 __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
    1183             :         }
    1184             : 
    1185             :         /*
    1186             :          * It's possible to miss clearing waiters here, when we woke our page
    1187             :          * waiters, but the hashed waitqueue has waiters for other pages on it.
    1188             :          * That's okay, it's a rare case. The next waker will clear it.
    1189             :          *
    1190             :          * Note that, depending on the page pool (buddy, hugetlb, ZONE_DEVICE,
    1191             :          * other), the flag may be cleared in the course of freeing the page;
    1192             :          * but that is not required for correctness.
    1193             :          */
    1194           0 :         if (!waitqueue_active(q) || !key.page_match)
    1195             :                 folio_clear_waiters(folio);
    1196             : 
    1197           0 :         spin_unlock_irqrestore(&q->lock, flags);
    1198           0 : }
    1199             : 
    1200             : static void folio_wake(struct folio *folio, int bit)
    1201             : {
    1202           0 :         if (!folio_test_waiters(folio))
    1203             :                 return;
    1204           0 :         folio_wake_bit(folio, bit);
    1205             : }
    1206             : 
    1207             : /*
    1208             :  * A choice of three behaviors for folio_wait_bit_common():
    1209             :  */
    1210             : enum behavior {
    1211             :         EXCLUSIVE,      /* Hold ref to page and take the bit when woken, like
    1212             :                          * __folio_lock() waiting on then setting PG_locked.
    1213             :                          */
    1214             :         SHARED,         /* Hold ref to page and check the bit when woken, like
    1215             :                          * folio_wait_writeback() waiting on PG_writeback.
    1216             :                          */
    1217             :         DROP,           /* Drop ref to page before wait, no check when woken,
    1218             :                          * like folio_put_wait_locked() on PG_locked.
    1219             :                          */
    1220             : };
    1221             : 
    1222             : /*
    1223             :  * Attempt to check (or get) the folio flag, and mark us done
    1224             :  * if successful.
    1225             :  */
    1226           0 : static inline bool folio_trylock_flag(struct folio *folio, int bit_nr,
    1227             :                                         struct wait_queue_entry *wait)
    1228             : {
    1229           0 :         if (wait->flags & WQ_FLAG_EXCLUSIVE) {
    1230           0 :                 if (test_and_set_bit(bit_nr, &folio->flags))
    1231             :                         return false;
    1232           0 :         } else if (test_bit(bit_nr, &folio->flags))
    1233             :                 return false;
    1234             : 
    1235           0 :         wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
    1236             :         return true;
    1237             : }
    1238             : 
    1239             : /* How many times do we accept lock stealing from under a waiter? */
    1240             : int sysctl_page_lock_unfairness = 5;
    1241             : 
    1242           0 : static inline int folio_wait_bit_common(struct folio *folio, int bit_nr,
    1243             :                 int state, enum behavior behavior)
    1244             : {
    1245           0 :         wait_queue_head_t *q = folio_waitqueue(folio);
    1246           0 :         int unfairness = sysctl_page_lock_unfairness;
    1247             :         struct wait_page_queue wait_page;
    1248           0 :         wait_queue_entry_t *wait = &wait_page.wait;
    1249           0 :         bool thrashing = false;
    1250           0 :         bool delayacct = false;
    1251             :         unsigned long pflags;
    1252             : 
    1253           0 :         if (bit_nr == PG_locked &&
    1254           0 :             !folio_test_uptodate(folio) && folio_test_workingset(folio)) {
    1255           0 :                 if (!folio_test_swapbacked(folio)) {
    1256             :                         delayacct_thrashing_start();
    1257             :                         delayacct = true;
    1258             :                 }
    1259             :                 psi_memstall_enter(&pflags);
    1260             :                 thrashing = true;
    1261             :         }
    1262             : 
    1263           0 :         init_wait(wait);
    1264           0 :         wait->func = wake_page_function;
    1265           0 :         wait_page.folio = folio;
    1266           0 :         wait_page.bit_nr = bit_nr;
    1267             : 
    1268             : repeat:
    1269           0 :         wait->flags = 0;
    1270           0 :         if (behavior == EXCLUSIVE) {
    1271           0 :                 wait->flags = WQ_FLAG_EXCLUSIVE;
    1272           0 :                 if (--unfairness < 0)
    1273           0 :                         wait->flags |= WQ_FLAG_CUSTOM;
    1274             :         }
    1275             : 
    1276             :         /*
    1277             :          * Do one last check whether we can get the
    1278             :          * page bit synchronously.
    1279             :          *
    1280             :          * Do the folio_set_waiters() marking before that
    1281             :          * to let any waker we _just_ missed know they
    1282             :          * need to wake us up (otherwise they'll never
    1283             :          * even go to the slow case that looks at the
    1284             :          * page queue), and add ourselves to the wait
    1285             :          * queue if we need to sleep.
    1286             :          *
    1287             :          * This part needs to be done under the queue
    1288             :          * lock to avoid races.
    1289             :          */
    1290           0 :         spin_lock_irq(&q->lock);
    1291           0 :         folio_set_waiters(folio);
    1292           0 :         if (!folio_trylock_flag(folio, bit_nr, wait))
    1293             :                 __add_wait_queue_entry_tail(q, wait);
    1294           0 :         spin_unlock_irq(&q->lock);
    1295             : 
    1296             :         /*
    1297             :          * From now on, all the logic will be based on
    1298             :          * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to
    1299             :          * see whether the page bit testing has already
    1300             :          * been done by the wake function.
    1301             :          *
    1302             :          * We can drop our reference to the folio.
    1303             :          */
    1304           0 :         if (behavior == DROP)
    1305             :                 folio_put(folio);
    1306             : 
    1307             :         /*
    1308             :          * Note that until the "finish_wait()", or until
    1309             :          * we see the WQ_FLAG_WOKEN flag, we need to
    1310             :          * be very careful with the 'wait->flags', because
    1311             :          * we may race with a waker that sets them.
    1312             :          */
    1313           0 :         for (;;) {
    1314             :                 unsigned int flags;
    1315             : 
    1316           0 :                 set_current_state(state);
    1317             : 
    1318             :                 /* Loop until we've been woken or interrupted */
    1319           0 :                 flags = smp_load_acquire(&wait->flags);
    1320           0 :                 if (!(flags & WQ_FLAG_WOKEN)) {
    1321           0 :                         if (signal_pending_state(state, current))
    1322             :                                 break;
    1323             : 
    1324           0 :                         io_schedule();
    1325           0 :                         continue;
    1326             :                 }
    1327             : 
    1328             :                 /* If we were non-exclusive, we're done */
    1329           0 :                 if (behavior != EXCLUSIVE)
    1330             :                         break;
    1331             : 
    1332             :                 /* If the waker got the lock for us, we're done */
    1333           0 :                 if (flags & WQ_FLAG_DONE)
    1334             :                         break;
    1335             : 
    1336             :                 /*
    1337             :                  * Otherwise, if we're getting the lock, we need to
    1338             :                  * try to get it ourselves.
    1339             :                  *
    1340             :                  * And if that fails, we'll have to retry this all.
    1341             :                  */
    1342           0 :                 if (unlikely(test_and_set_bit(bit_nr, folio_flags(folio, 0))))
    1343             :                         goto repeat;
    1344             : 
    1345           0 :                 wait->flags |= WQ_FLAG_DONE;
    1346           0 :                 break;
    1347             :         }
    1348             : 
    1349             :         /*
    1350             :          * If a signal happened, this 'finish_wait()' may remove the last
    1351             :          * waiter from the wait-queues, but the folio waiters bit will remain
    1352             :          * set. That's ok. The next wakeup will take care of it, and trying
    1353             :          * to do it here would be difficult and prone to races.
    1354             :          */
    1355           0 :         finish_wait(q, wait);
    1356             : 
    1357             :         if (thrashing) {
    1358             :                 if (delayacct)
    1359             :                         delayacct_thrashing_end();
    1360             :                 psi_memstall_leave(&pflags);
    1361             :         }
    1362             : 
    1363             :         /*
    1364             :          * NOTE! The wait->flags weren't stable until we've done the
    1365             :          * 'finish_wait()', and we could have exited the loop above due
    1366             :          * to a signal, and had a wakeup event happen after the signal
    1367             :          * test but before the 'finish_wait()'.
    1368             :          *
    1369             :          * So only after the finish_wait() can we reliably determine
    1370             :          * if we got woken up or not, so we can now figure out the final
    1371             :          * return value based on that state without races.
    1372             :          *
    1373             :          * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive
    1374             :          * waiter, but an exclusive one requires WQ_FLAG_DONE.
    1375             :          */
    1376           0 :         if (behavior == EXCLUSIVE)
    1377           0 :                 return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR;
    1378             : 
    1379           0 :         return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
    1380             : }
    1381             : 
    1382             : #ifdef CONFIG_MIGRATION
    1383             : /**
    1384             :  * migration_entry_wait_on_locked - Wait for a migration entry to be removed
    1385             :  * @entry: migration swap entry.
    1386             :  * @ptep: mapped pte pointer. Will return with the ptep unmapped. Only required
    1387             :  *        for pte entries, pass NULL for pmd entries.
    1388             :  * @ptl: already locked ptl. This function will drop the lock.
    1389             :  *
    1390             :  * Wait for a migration entry referencing the given page to be removed. This is
    1391             :  * equivalent to put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE) except
    1392             :  * this can be called without taking a reference on the page. Instead this
    1393             :  * should be called while holding the ptl for the migration entry referencing
    1394             :  * the page.
    1395             :  *
    1396             :  * Returns after unmapping and unlocking the pte/ptl with pte_unmap_unlock().
    1397             :  *
    1398             :  * This follows the same logic as folio_wait_bit_common() so see the comments
    1399             :  * there.
    1400             :  */
    1401           0 : void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep,
    1402             :                                 spinlock_t *ptl)
    1403             : {
    1404             :         struct wait_page_queue wait_page;
    1405           0 :         wait_queue_entry_t *wait = &wait_page.wait;
    1406           0 :         bool thrashing = false;
    1407           0 :         bool delayacct = false;
    1408             :         unsigned long pflags;
    1409             :         wait_queue_head_t *q;
    1410           0 :         struct folio *folio = page_folio(pfn_swap_entry_to_page(entry));
    1411             : 
    1412           0 :         q = folio_waitqueue(folio);
    1413           0 :         if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) {
    1414           0 :                 if (!folio_test_swapbacked(folio)) {
    1415             :                         delayacct_thrashing_start();
    1416             :                         delayacct = true;
    1417             :                 }
    1418             :                 psi_memstall_enter(&pflags);
    1419             :                 thrashing = true;
    1420             :         }
    1421             : 
    1422           0 :         init_wait(wait);
    1423           0 :         wait->func = wake_page_function;
    1424           0 :         wait_page.folio = folio;
    1425           0 :         wait_page.bit_nr = PG_locked;
    1426             :         wait->flags = 0;
    1427             : 
    1428           0 :         spin_lock_irq(&q->lock);
    1429           0 :         folio_set_waiters(folio);
    1430           0 :         if (!folio_trylock_flag(folio, PG_locked, wait))
    1431             :                 __add_wait_queue_entry_tail(q, wait);
    1432           0 :         spin_unlock_irq(&q->lock);
    1433             : 
    1434             :         /*
     1435             :          * If a migration entry exists for the page, the migration path must hold
    1436             :          * a valid reference to the page, and it must take the ptl to remove the
    1437             :          * migration entry. So the page is valid until the ptl is dropped.
    1438             :          */
    1439           0 :         if (ptep)
    1440             :                 pte_unmap_unlock(ptep, ptl);
    1441             :         else
    1442             :                 spin_unlock(ptl);
    1443             : 
    1444           0 :         for (;;) {
    1445             :                 unsigned int flags;
    1446             : 
    1447           0 :                 set_current_state(TASK_UNINTERRUPTIBLE);
    1448             : 
    1449             :                 /* Loop until we've been woken or interrupted */
    1450           0 :                 flags = smp_load_acquire(&wait->flags);
    1451           0 :                 if (!(flags & WQ_FLAG_WOKEN)) {
    1452           0 :                         if (signal_pending_state(TASK_UNINTERRUPTIBLE, current))
    1453             :                                 break;
    1454             : 
    1455           0 :                         io_schedule();
    1456           0 :                         continue;
    1457             :                 }
    1458             :                 break;
    1459             :         }
    1460             : 
    1461           0 :         finish_wait(q, wait);
    1462             : 
    1463             :         if (thrashing) {
    1464             :                 if (delayacct)
    1465             :                         delayacct_thrashing_end();
    1466             :                 psi_memstall_leave(&pflags);
    1467             :         }
    1468           0 : }
    1469             : #endif
    1470             : 
    1471           0 : void folio_wait_bit(struct folio *folio, int bit_nr)
    1472             : {
    1473           0 :         folio_wait_bit_common(folio, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
    1474           0 : }
    1475             : EXPORT_SYMBOL(folio_wait_bit);
    1476             : 
    1477           0 : int folio_wait_bit_killable(struct folio *folio, int bit_nr)
    1478             : {
    1479           0 :         return folio_wait_bit_common(folio, bit_nr, TASK_KILLABLE, SHARED);
    1480             : }
    1481             : EXPORT_SYMBOL(folio_wait_bit_killable);
    1482             : 
    1483             : /**
    1484             :  * folio_put_wait_locked - Drop a reference and wait for it to be unlocked
    1485             :  * @folio: The folio to wait for.
    1486             :  * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc).
    1487             :  *
    1488             :  * The caller should hold a reference on @folio.  They expect the page to
    1489             :  * become unlocked relatively soon, but do not wish to hold up migration
    1490             :  * (for example) by holding the reference while waiting for the folio to
    1491             :  * come unlocked.  After this function returns, the caller should not
    1492             :  * dereference @folio.
    1493             :  *
    1494             :  * Return: 0 if the folio was unlocked or -EINTR if interrupted by a signal.
    1495             :  */
    1496           0 : int folio_put_wait_locked(struct folio *folio, int state)
    1497             : {
    1498           0 :         return folio_wait_bit_common(folio, PG_locked, state, DROP);
    1499             : }
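
/*
 * Editor's illustration, not part of filemap.c: a caller holding a folio
 * reference that only wants to wait for the lock without pinning the folio
 * (so that, for example, migration is not held up). The helper name
 * example_wait_and_release() is hypothetical.
 */
static int example_wait_and_release(struct folio *folio)
{
        /* Consumes our reference; 'folio' must not be touched afterwards. */
        return folio_put_wait_locked(folio, TASK_KILLABLE);
}
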
    1500             : 
    1501             : /**
    1502             :  * folio_add_wait_queue - Add an arbitrary waiter to a folio's wait queue
    1503             :  * @folio: Folio defining the wait queue of interest
    1504             :  * @waiter: Waiter to add to the queue
    1505             :  *
    1506             :  * Add an arbitrary @waiter to the wait queue for the nominated @folio.
    1507             :  */
    1508           0 : void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter)
    1509             : {
    1510           0 :         wait_queue_head_t *q = folio_waitqueue(folio);
    1511             :         unsigned long flags;
    1512             : 
    1513           0 :         spin_lock_irqsave(&q->lock, flags);
    1514           0 :         __add_wait_queue_entry_tail(q, waiter);
    1515           0 :         folio_set_waiters(folio);
    1516           0 :         spin_unlock_irqrestore(&q->lock, flags);
    1517           0 : }
    1518             : EXPORT_SYMBOL_GPL(folio_add_wait_queue);
    1519             : 
    1520             : #ifndef clear_bit_unlock_is_negative_byte
    1521             : 
    1522             : /*
     1523             :  * PG_waiters is the high bit in the same byte as PG_locked.
     1524             :  *
     1525             :  * On x86 (and on many other architectures), we can clear PG_locked and
    1526             :  * test the sign bit at the same time. But if the architecture does
    1527             :  * not support that special operation, we just do this all by hand
    1528             :  * instead.
    1529             :  *
    1530             :  * The read of PG_waiters has to be after (or concurrently with) PG_locked
    1531             :  * being cleared, but a memory barrier should be unnecessary since it is
    1532             :  * in the same byte as PG_locked.
    1533             :  */
    1534             : static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem)
    1535             : {
    1536             :         clear_bit_unlock(nr, mem);
    1537             :         /* smp_mb__after_atomic(); */
    1538             :         return test_bit(PG_waiters, mem);
    1539             : }
    1540             : 
    1541             : #endif
    1542             : 
    1543             : /**
    1544             :  * folio_unlock - Unlock a locked folio.
    1545             :  * @folio: The folio.
    1546             :  *
    1547             :  * Unlocks the folio and wakes up any thread sleeping on the page lock.
    1548             :  *
    1549             :  * Context: May be called from interrupt or process context.  May not be
    1550             :  * called from NMI context.
    1551             :  */
    1552           0 : void folio_unlock(struct folio *folio)
    1553             : {
    1554             :         /* Bit 7 allows x86 to check the byte's sign bit */
    1555             :         BUILD_BUG_ON(PG_waiters != 7);
    1556             :         BUILD_BUG_ON(PG_locked > 7);
    1557             :         VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
    1558           0 :         if (clear_bit_unlock_is_negative_byte(PG_locked, folio_flags(folio, 0)))
    1559           0 :                 folio_wake_bit(folio, PG_locked);
    1560           0 : }
    1561             : EXPORT_SYMBOL(folio_unlock);
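
/*
 * Editor's illustration, not part of filemap.c: the usual lock/unlock
 * pairing. folio_lock() may sleep in __folio_lock() below until the bit is
 * handed over; folio_unlock() above then wakes the next waiter.
 * example_with_folio_locked() is a hypothetical helper.
 */
static void example_with_folio_locked(struct folio *folio)
{
        folio_lock(folio);
        /* ... work that needs the folio stable (not truncated or moved) ... */
        folio_unlock(folio);
}
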
    1562             : 
    1563             : /**
    1564             :  * folio_end_private_2 - Clear PG_private_2 and wake any waiters.
    1565             :  * @folio: The folio.
    1566             :  *
    1567             :  * Clear the PG_private_2 bit on a folio and wake up any sleepers waiting for
    1568             :  * it.  The folio reference held for PG_private_2 being set is released.
    1569             :  *
    1570             :  * This is, for example, used when a netfs folio is being written to a local
    1571             :  * disk cache, thereby allowing writes to the cache for the same folio to be
    1572             :  * serialised.
    1573             :  */
    1574           0 : void folio_end_private_2(struct folio *folio)
    1575             : {
    1576             :         VM_BUG_ON_FOLIO(!folio_test_private_2(folio), folio);
    1577           0 :         clear_bit_unlock(PG_private_2, folio_flags(folio, 0));
    1578           0 :         folio_wake_bit(folio, PG_private_2);
    1579           0 :         folio_put(folio);
    1580           0 : }
    1581             : EXPORT_SYMBOL(folio_end_private_2);
    1582             : 
    1583             : /**
    1584             :  * folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio.
    1585             :  * @folio: The folio to wait on.
    1586             :  *
    1587             :  * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio.
    1588             :  */
    1589           0 : void folio_wait_private_2(struct folio *folio)
    1590             : {
    1591           0 :         while (folio_test_private_2(folio))
    1592             :                 folio_wait_bit(folio, PG_private_2);
    1593           0 : }
    1594             : EXPORT_SYMBOL(folio_wait_private_2);
    1595             : 
    1596             : /**
    1597             :  * folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio.
    1598             :  * @folio: The folio to wait on.
    1599             :  *
    1600             :  * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio or until a
    1601             :  * fatal signal is received by the calling task.
    1602             :  *
    1603             :  * Return:
    1604             :  * - 0 if successful.
    1605             :  * - -EINTR if a fatal signal was encountered.
    1606             :  */
    1607           0 : int folio_wait_private_2_killable(struct folio *folio)
    1608             : {
    1609           0 :         int ret = 0;
    1610             : 
    1611           0 :         while (folio_test_private_2(folio)) {
    1612           0 :                 ret = folio_wait_bit_killable(folio, PG_private_2);
    1613           0 :                 if (ret < 0)
    1614             :                         break;
    1615             :         }
    1616             : 
    1617           0 :         return ret;
    1618             : }
    1619             : EXPORT_SYMBOL(folio_wait_private_2_killable);
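
/*
 * Editor's illustration, not part of filemap.c: when a netfs-style cache
 * write completes, its completion path releases PG_private_2, waking any
 * task blocked in folio_wait_private_2() above. example_cache_write_done()
 * is a hypothetical callback name.
 */
static void example_cache_write_done(struct folio *folio)
{
        /* Clears PG_private_2, wakes waiters, drops the ref held for it. */
        folio_end_private_2(folio);
}
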
    1620             : 
    1621             : /**
    1622             :  * folio_end_writeback - End writeback against a folio.
    1623             :  * @folio: The folio.
    1624             :  */
    1625           0 : void folio_end_writeback(struct folio *folio)
    1626             : {
    1627             :         /*
    1628             :          * folio_test_clear_reclaim() could be used here but it is an
    1629             :          * atomic operation and overkill in this particular case. Failing
    1630             :          * to shuffle a folio marked for immediate reclaim is too mild
    1631             :          * a gain to justify taking an atomic operation penalty at the
    1632             :          * end of every folio writeback.
    1633             :          */
    1634           0 :         if (folio_test_reclaim(folio)) {
    1635           0 :                 folio_clear_reclaim(folio);
    1636           0 :                 folio_rotate_reclaimable(folio);
    1637             :         }
    1638             : 
    1639             :         /*
    1640             :          * Writeback does not hold a folio reference of its own, relying
    1641             :          * on truncation to wait for the clearing of PG_writeback.
    1642             :          * But here we must make sure that the folio is not freed and
    1643             :          * reused before the folio_wake().
    1644             :          */
    1645           0 :         folio_get(folio);
    1646           0 :         if (!__folio_end_writeback(folio))
    1647           0 :                 BUG();
    1648             : 
    1649           0 :         smp_mb__after_atomic();
    1650           0 :         folio_wake(folio, PG_writeback);
    1651           0 :         acct_reclaim_writeback(folio);
    1652           0 :         folio_put(folio);
    1653           0 : }
    1654             : EXPORT_SYMBOL(folio_end_writeback);
    1655             : 
    1656             : /*
    1657             :  * After completing I/O on a page, call this routine to update the page
    1658             :  * flags appropriately
    1659             :  */
    1660           0 : void page_endio(struct page *page, bool is_write, int err)
    1661             : {
    1662           0 :         if (!is_write) {
    1663           0 :                 if (!err) {
    1664             :                         SetPageUptodate(page);
    1665             :                 } else {
    1666           0 :                         ClearPageUptodate(page);
    1667             :                         SetPageError(page);
    1668             :                 }
    1669           0 :                 unlock_page(page);
    1670             :         } else {
    1671           0 :                 if (err) {
    1672             :                         struct address_space *mapping;
    1673             : 
    1674           0 :                         SetPageError(page);
    1675           0 :                         mapping = page_mapping(page);
    1676           0 :                         if (mapping)
    1677           0 :                                 mapping_set_error(mapping, err);
    1678             :                 }
    1679           0 :                 end_page_writeback(page);
    1680             :         }
    1681           0 : }
    1682             : EXPORT_SYMBOL_GPL(page_endio);
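
/*
 * Editor's illustration, not part of filemap.c: a block I/O completion
 * handler in the style of fs/mpage.c, calling page_endio() for each page in
 * the bio. Assumes <linux/bio.h>; example_end_io() is a hypothetical name.
 */
static void example_end_io(struct bio *bio)
{
        struct bio_vec *bv;
        struct bvec_iter_all iter_all;

        bio_for_each_segment_all(bv, bio, iter_all)
                page_endio(bv->bv_page, bio_op(bio) == REQ_OP_WRITE,
                           blk_status_to_errno(bio->bi_status));
        bio_put(bio);
}
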
    1683             : 
    1684             : /**
    1685             :  * __folio_lock - Get a lock on the folio, assuming we need to sleep to get it.
    1686             :  * @folio: The folio to lock
    1687             :  */
    1688           0 : void __folio_lock(struct folio *folio)
    1689             : {
    1690           0 :         folio_wait_bit_common(folio, PG_locked, TASK_UNINTERRUPTIBLE,
    1691             :                                 EXCLUSIVE);
    1692           0 : }
    1693             : EXPORT_SYMBOL(__folio_lock);
    1694             : 
    1695           0 : int __folio_lock_killable(struct folio *folio)
    1696             : {
    1697           0 :         return folio_wait_bit_common(folio, PG_locked, TASK_KILLABLE,
    1698             :                                         EXCLUSIVE);
    1699             : }
    1700             : EXPORT_SYMBOL_GPL(__folio_lock_killable);
    1701             : 
    1702           0 : static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait)
    1703             : {
    1704           0 :         struct wait_queue_head *q = folio_waitqueue(folio);
    1705           0 :         int ret = 0;
    1706             : 
    1707           0 :         wait->folio = folio;
    1708           0 :         wait->bit_nr = PG_locked;
    1709             : 
    1710           0 :         spin_lock_irq(&q->lock);
    1711           0 :         __add_wait_queue_entry_tail(q, &wait->wait);
    1712           0 :         folio_set_waiters(folio);
    1713           0 :         ret = !folio_trylock(folio);
    1714             :         /*
    1715             :          * If we were successful now, we know we're still on the
    1716             :          * waitqueue as we're still under the lock. This means it's
    1717             :          * safe to remove and return success, we know the callback
    1718             :          * isn't going to trigger.
    1719             :          */
    1720           0 :         if (!ret)
    1721           0 :                 __remove_wait_queue(q, &wait->wait);
    1722             :         else
    1723             :                 ret = -EIOCBQUEUED;
    1724           0 :         spin_unlock_irq(&q->lock);
    1725           0 :         return ret;
    1726             : }
    1727             : 
    1728             : /*
    1729             :  * Return values:
    1730             :  * true - folio is locked; mmap_lock is still held.
    1731             :  * false - folio is not locked.
     1732             :  *     mmap_lock has been released (mmap_read_unlock()), unless flags had both
    1733             :  *     FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in
    1734             :  *     which case mmap_lock is still held.
    1735             :  *
    1736             :  * If neither ALLOW_RETRY nor KILLABLE are set, will always return true
    1737             :  * with the folio locked and the mmap_lock unperturbed.
    1738             :  */
    1739           0 : bool __folio_lock_or_retry(struct folio *folio, struct mm_struct *mm,
    1740             :                          unsigned int flags)
    1741             : {
    1742           0 :         if (fault_flag_allow_retry_first(flags)) {
    1743             :                 /*
    1744             :                  * CAUTION! In this case, mmap_lock is not released
     1745             :                  * even though we return false.
    1746             :                  */
    1747           0 :                 if (flags & FAULT_FLAG_RETRY_NOWAIT)
    1748             :                         return false;
    1749             : 
    1750           0 :                 mmap_read_unlock(mm);
    1751           0 :                 if (flags & FAULT_FLAG_KILLABLE)
    1752           0 :                         folio_wait_locked_killable(folio);
    1753             :                 else
    1754             :                         folio_wait_locked(folio);
    1755             :                 return false;
    1756             :         }
    1757           0 :         if (flags & FAULT_FLAG_KILLABLE) {
    1758             :                 bool ret;
    1759             : 
    1760           0 :                 ret = __folio_lock_killable(folio);
    1761           0 :                 if (ret) {
    1762           0 :                         mmap_read_unlock(mm);
    1763           0 :                         return false;
    1764             :                 }
    1765             :         } else {
    1766             :                 __folio_lock(folio);
    1767             :         }
    1768             : 
    1769             :         return true;
    1770             : }
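
/*
 * Editor's illustration, not part of filemap.c: roughly how a page-fault
 * path might use the helper above. example_fault_lock_folio() is a
 * hypothetical name; real fault handlers check the flags more carefully.
 */
static vm_fault_t example_fault_lock_folio(struct folio *folio,
                                           struct vm_fault *vmf)
{
        if (!__folio_lock_or_retry(folio, vmf->vma->vm_mm, vmf->flags)) {
                /* mmap_lock may already have been dropped for us. */
                return VM_FAULT_RETRY;
        }
        return 0;       /* folio locked, mmap_lock still held */
}
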
    1771             : 
    1772             : /**
    1773             :  * page_cache_next_miss() - Find the next gap in the page cache.
    1774             :  * @mapping: Mapping.
    1775             :  * @index: Index.
    1776             :  * @max_scan: Maximum range to search.
    1777             :  *
    1778             :  * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the
    1779             :  * gap with the lowest index.
    1780             :  *
    1781             :  * This function may be called under the rcu_read_lock.  However, this will
    1782             :  * not atomically search a snapshot of the cache at a single point in time.
    1783             :  * For example, if a gap is created at index 5, then subsequently a gap is
    1784             :  * created at index 10, page_cache_next_miss covering both indices may
    1785             :  * return 10 if called under the rcu_read_lock.
    1786             :  *
    1787             :  * Return: The index of the gap if found, otherwise an index outside the
    1788             :  * range specified (in which case 'return - index >= max_scan' will be true).
    1789             :  * In the rare case of index wrap-around, 0 will be returned.
    1790             :  */
    1791           0 : pgoff_t page_cache_next_miss(struct address_space *mapping,
    1792             :                              pgoff_t index, unsigned long max_scan)
    1793             : {
    1794           0 :         XA_STATE(xas, &mapping->i_pages, index);
    1795             : 
    1796           0 :         while (max_scan--) {
    1797           0 :                 void *entry = xas_next(&xas);
    1798           0 :                 if (!entry || xa_is_value(entry))
    1799             :                         break;
    1800           0 :                 if (xas.xa_index == 0)
    1801             :                         break;
    1802             :         }
    1803             : 
    1804           0 :         return xas.xa_index;
    1805             : }
    1806             : EXPORT_SYMBOL(page_cache_next_miss);
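
/*
 * Editor's illustration, not part of filemap.c: using the helper above to
 * ask whether the page cache has a hole within the next 16 pages.
 * example_next_hole() is a hypothetical name.
 */
static pgoff_t example_next_hole(struct address_space *mapping, pgoff_t index)
{
        pgoff_t gap = page_cache_next_miss(mapping, index, 16);

        if (gap - index >= 16)          /* per the Return: rule above */
                return ULONG_MAX;       /* no hole in [index, index + 15] */
        return gap;
}
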
    1807             : 
    1808             : /**
    1809             :  * page_cache_prev_miss() - Find the previous gap in the page cache.
    1810             :  * @mapping: Mapping.
    1811             :  * @index: Index.
    1812             :  * @max_scan: Maximum range to search.
    1813             :  *
    1814             :  * Search the range [max(index - max_scan + 1, 0), index] for the
    1815             :  * gap with the highest index.
    1816             :  *
    1817             :  * This function may be called under the rcu_read_lock.  However, this will
    1818             :  * not atomically search a snapshot of the cache at a single point in time.
    1819             :  * For example, if a gap is created at index 10, then subsequently a gap is
    1820             :  * created at index 5, page_cache_prev_miss() covering both indices may
    1821             :  * return 5 if called under the rcu_read_lock.
    1822             :  *
    1823             :  * Return: The index of the gap if found, otherwise an index outside the
    1824             :  * range specified (in which case 'index - return >= max_scan' will be true).
    1825             :  * In the rare case of wrap-around, ULONG_MAX will be returned.
    1826             :  */
    1827           0 : pgoff_t page_cache_prev_miss(struct address_space *mapping,
    1828             :                              pgoff_t index, unsigned long max_scan)
    1829             : {
    1830           0 :         XA_STATE(xas, &mapping->i_pages, index);
    1831             : 
    1832           0 :         while (max_scan--) {
    1833           0 :                 void *entry = xas_prev(&xas);
    1834           0 :                 if (!entry || xa_is_value(entry))
    1835             :                         break;
    1836           0 :                 if (xas.xa_index == ULONG_MAX)
    1837             :                         break;
    1838             :         }
    1839             : 
    1840           0 :         return xas.xa_index;
    1841             : }
    1842             : EXPORT_SYMBOL(page_cache_prev_miss);
    1843             : 
    1844             : /*
    1845             :  * Lockless page cache protocol:
    1846             :  * On the lookup side:
    1847             :  * 1. Load the folio from i_pages
    1848             :  * 2. Increment the refcount if it's not zero
    1849             :  * 3. If the folio is not found by xas_reload(), put the refcount and retry
    1850             :  *
    1851             :  * On the removal side:
    1852             :  * A. Freeze the page (by zeroing the refcount if nobody else has a reference)
    1853             :  * B. Remove the page from i_pages
    1854             :  * C. Return the page to the page allocator
    1855             :  *
    1856             :  * This means that any page may have its reference count temporarily
    1857             :  * increased by a speculative page cache (or fast GUP) lookup as it can
    1858             :  * be allocated by another user before the RCU grace period expires.
    1859             :  * Because the refcount temporarily acquired here may end up being the
    1860             :  * last refcount on the page, any page allocation must be freeable by
    1861             :  * folio_put().
    1862             :  */
    1863             : 
    1864             : /*
    1865             :  * mapping_get_entry - Get a page cache entry.
    1866             :  * @mapping: the address_space to search
    1867             :  * @index: The page cache index.
    1868             :  *
    1869             :  * Looks up the page cache entry at @mapping & @index.  If it is a folio,
    1870             :  * it is returned with an increased refcount.  If it is a shadow entry
    1871             :  * of a previously evicted folio, or a swap entry from shmem/tmpfs,
    1872             :  * it is returned without further action.
    1873             :  *
    1874             :  * Return: The folio, swap or shadow entry, %NULL if nothing is found.
    1875             :  */
    1876           0 : static void *mapping_get_entry(struct address_space *mapping, pgoff_t index)
    1877             : {
    1878           0 :         XA_STATE(xas, &mapping->i_pages, index);
    1879             :         struct folio *folio;
    1880             : 
    1881             :         rcu_read_lock();
    1882             : repeat:
    1883           0 :         xas_reset(&xas);
    1884           0 :         folio = xas_load(&xas);
    1885           0 :         if (xas_retry(&xas, folio))
    1886             :                 goto repeat;
    1887             :         /*
    1888             :          * A shadow entry of a recently evicted page, or a swap entry from
    1889             :          * shmem/tmpfs.  Return it without attempting to raise page count.
    1890             :          */
    1891           0 :         if (!folio || xa_is_value(folio))
    1892             :                 goto out;
    1893             : 
    1894           0 :         if (!folio_try_get_rcu(folio))
    1895             :                 goto repeat;
    1896             : 
    1897           0 :         if (unlikely(folio != xas_reload(&xas))) {
    1898             :                 folio_put(folio);
    1899             :                 goto repeat;
    1900             :         }
    1901             : out:
    1902             :         rcu_read_unlock();
    1903             : 
    1904           0 :         return folio;
    1905             : }
    1906             : 
    1907             : /**
    1908             :  * __filemap_get_folio - Find and get a reference to a folio.
    1909             :  * @mapping: The address_space to search.
    1910             :  * @index: The page index.
    1911             :  * @fgp_flags: %FGP flags modify how the folio is returned.
    1912             :  * @gfp: Memory allocation flags to use if %FGP_CREAT is specified.
    1913             :  *
    1914             :  * Looks up the page cache entry at @mapping & @index.
    1915             :  *
    1916             :  * @fgp_flags can be zero or more of these flags:
    1917             :  *
    1918             :  * * %FGP_ACCESSED - The folio will be marked accessed.
    1919             :  * * %FGP_LOCK - The folio is returned locked.
    1920             :  * * %FGP_ENTRY - If there is a shadow / swap / DAX entry, return it
    1921             :  *   instead of allocating a new folio to replace it.
    1922             :  * * %FGP_CREAT - If no page is present then a new page is allocated using
    1923             :  *   @gfp and added to the page cache and the VM's LRU list.
    1924             :  *   The page is returned locked and with an increased refcount.
    1925             :  * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the
    1926             :  *   page is already in cache.  If the page was allocated, unlock it before
    1927             :  *   returning so the caller can do the same dance.
    1928             :  * * %FGP_WRITE - The page will be written to by the caller.
    1929             :  * * %FGP_NOFS - __GFP_FS will get cleared in gfp.
    1930             :  * * %FGP_NOWAIT - Don't get blocked by page lock.
    1931             :  * * %FGP_STABLE - Wait for the folio to be stable (finished writeback)
    1932             :  *
    1933             :  * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even
    1934             :  * if the %GFP flags specified for %FGP_CREAT are atomic.
    1935             :  *
    1936             :  * If there is a page cache page, it is returned with an increased refcount.
    1937             :  *
    1938             :  * Return: The found folio or %NULL otherwise.
    1939             :  */
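
/*
 * Editor's illustration, not part of filemap.c: a typical find-or-create
 * call for a buffered write, asking for a locked, writeback-stable folio.
 * example_grab_for_write() is a hypothetical name.
 */
static struct folio *example_grab_for_write(struct address_space *mapping,
                                            pgoff_t index)
{
        return __filemap_get_folio(mapping, index,
                        FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE,
                        mapping_gfp_mask(mapping));
}
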
    1940           0 : struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
    1941             :                 int fgp_flags, gfp_t gfp)
    1942             : {
    1943             :         struct folio *folio;
    1944             : 
    1945             : repeat:
    1946           0 :         folio = mapping_get_entry(mapping, index);
    1947           0 :         if (xa_is_value(folio)) {
    1948           0 :                 if (fgp_flags & FGP_ENTRY)
    1949             :                         return folio;
    1950             :                 folio = NULL;
    1951             :         }
    1952           0 :         if (!folio)
    1953             :                 goto no_page;
    1954             : 
    1955           0 :         if (fgp_flags & FGP_LOCK) {
    1956           0 :                 if (fgp_flags & FGP_NOWAIT) {
    1957           0 :                         if (!folio_trylock(folio)) {
    1958             :                                 folio_put(folio);
    1959             :                                 return NULL;
    1960             :                         }
    1961             :                 } else {
    1962           0 :                         folio_lock(folio);
    1963             :                 }
    1964             : 
    1965             :                 /* Has the page been truncated? */
    1966           0 :                 if (unlikely(folio->mapping != mapping)) {
    1967           0 :                         folio_unlock(folio);
    1968             :                         folio_put(folio);
    1969             :                         goto repeat;
    1970             :                 }
    1971             :                 VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
    1972             :         }
    1973             : 
    1974           0 :         if (fgp_flags & FGP_ACCESSED)
    1975           0 :                 folio_mark_accessed(folio);
    1976             :         else if (fgp_flags & FGP_WRITE) {
    1977             :                 /* Clear idle flag for buffer write */
    1978             :                 if (folio_test_idle(folio))
    1979             :                         folio_clear_idle(folio);
    1980             :         }
    1981             : 
    1982           0 :         if (fgp_flags & FGP_STABLE)
    1983           0 :                 folio_wait_stable(folio);
    1984             : no_page:
    1985           0 :         if (!folio && (fgp_flags & FGP_CREAT)) {
    1986             :                 int err;
    1987           0 :                 if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
    1988           0 :                         gfp |= __GFP_WRITE;
    1989           0 :                 if (fgp_flags & FGP_NOFS)
    1990           0 :                         gfp &= ~__GFP_FS;
    1991             : 
    1992           0 :                 folio = filemap_alloc_folio(gfp, 0);
    1993           0 :                 if (!folio)
    1994             :                         return NULL;
    1995             : 
    1996           0 :                 if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
    1997           0 :                         fgp_flags |= FGP_LOCK;
    1998             : 
    1999             :                 /* Init accessed so we avoid an atomic mark_page_accessed() later */
    2000           0 :                 if (fgp_flags & FGP_ACCESSED)
    2001             :                         __folio_set_referenced(folio);
    2002             : 
    2003           0 :                 err = filemap_add_folio(mapping, folio, index, gfp);
    2004           0 :                 if (unlikely(err)) {
    2005           0 :                         folio_put(folio);
    2006           0 :                         folio = NULL;
    2007           0 :                         if (err == -EEXIST)
    2008             :                                 goto repeat;
    2009             :                 }
    2010             : 
    2011             :                 /*
    2012             :                  * filemap_add_folio locks the page, and for mmap
    2013             :                  * we expect an unlocked page.
    2014             :                  */
    2015           0 :                 if (folio && (fgp_flags & FGP_FOR_MMAP))
    2016             :                         folio_unlock(folio);
    2017             :         }
    2018             : 
    2019             :         return folio;
    2020             : }
    2021             : EXPORT_SYMBOL(__filemap_get_folio);
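
To illustrate how the FGP flags documented above combine in practice, here is a minimal caller sketch (not part of filemap.c; the helper name and the flag combination are illustrative assumptions). It finds or creates a locked folio the caller intends to write, waiting for any writeback to finish first:

        #include <linux/pagemap.h>

        /* Hypothetical helper, for illustration only. */
        static struct folio *example_grab_folio_for_write(struct address_space *mapping,
                                                          pgoff_t index)
        {
                struct folio *folio;

                folio = __filemap_get_folio(mapping, index,
                                FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE,
                                mapping_gfp_mask(mapping));
                if (!folio)
                        return NULL;    /* not found and allocation failed */

                /* On success the folio is locked and holds an extra reference. */
                return folio;
        }

The caller is then responsible for folio_unlock() and folio_put() once it is done with the folio.
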
    2022             : 
    2023           0 : static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max,
    2024             :                 xa_mark_t mark)
    2025             : {
    2026             :         struct folio *folio;
    2027             : 
    2028             : retry:
    2029           0 :         if (mark == XA_PRESENT)
    2030           0 :                 folio = xas_find(xas, max);
    2031             :         else
    2032           0 :                 folio = xas_find_marked(xas, max, mark);
    2033             : 
    2034           0 :         if (xas_retry(xas, folio))
    2035             :                 goto retry;
    2036             :         /*
    2037             :          * A shadow entry of a recently evicted page, a swap
    2038             :          * entry from shmem/tmpfs or a DAX entry.  Return it
    2039             :          * without attempting to raise page count.
    2040             :          */
    2041           0 :         if (!folio || xa_is_value(folio))
    2042             :                 return folio;
    2043             : 
    2044           0 :         if (!folio_try_get_rcu(folio))
    2045             :                 goto reset;
    2046             : 
    2047           0 :         if (unlikely(folio != xas_reload(xas))) {
    2048             :                 folio_put(folio);
    2049             :                 goto reset;
    2050             :         }
    2051             : 
    2052             :         return folio;
    2053             : reset:
    2054           0 :         xas_reset(xas);
    2055             :         goto retry;
    2056             : }
    2057             : 
    2058             : /**
    2059             :  * find_get_entries - gang pagecache lookup
    2060             :  * @mapping:    The address_space to search
    2061             :  * @start:      The starting page cache index
    2062             :  * @end:        The final page index (inclusive).
    2063             :  * @fbatch:     Where the resulting entries are placed.
    2064             :  * @indices:    The cache indices corresponding to the entries in @fbatch
    2065             :  *
    2066             :  * find_get_entries() will search for and return a batch of entries in
    2067             :  * the mapping.  The entries are placed in @fbatch.  find_get_entries()
    2068             :  * takes a reference on any actual folios it returns.
    2069             :  *
    2070             :  * The entries have ascending indexes.  The indices may not be consecutive
    2071             :  * due to not-present entries or large folios.
    2072             :  *
    2073             :  * Any shadow entries of evicted folios, or swap entries from
    2074             :  * shmem/tmpfs, are included in the returned array.
    2075             :  *
    2076             :  * Return: The number of entries which were found.
    2077             :  */
    2078           0 : unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
    2079             :                 pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)
    2080             : {
    2081           0 :         XA_STATE(xas, &mapping->i_pages, start);
    2082             :         struct folio *folio;
    2083             : 
    2084             :         rcu_read_lock();
    2085           0 :         while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) {
    2086           0 :                 indices[fbatch->nr] = xas.xa_index;
    2087           0 :                 if (!folio_batch_add(fbatch, folio))
    2088             :                         break;
    2089             :         }
    2090             :         rcu_read_unlock();
    2091             : 
    2092           0 :         return folio_batch_count(fbatch);
    2093             : }
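
As a rough usage sketch (assumed helper name, not taken from the kernel), a caller examining one batch of entries in a range and distinguishing real folios from shadow/swap value entries might look like this:

        #include <linux/pagemap.h>
        #include <linux/pagevec.h>

        static void example_scan_entries(struct address_space *mapping,
                                         pgoff_t start, pgoff_t end)
        {
                struct folio_batch fbatch;
                pgoff_t indices[PAGEVEC_SIZE];
                unsigned int i, nr;

                folio_batch_init(&fbatch);
                nr = find_get_entries(mapping, start, end, &fbatch, indices);
                for (i = 0; i < nr; i++) {
                        struct folio *folio = fbatch.folios[i];

                        if (xa_is_value(folio))
                                continue;       /* shadow/swap entry: no reference held */
                        /* ... inspect the folio cached at indices[i] ... */
                        folio_put(folio);       /* drop the reference the lookup took */
                }
        }
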
    2094             : 
    2095             : /**
    2096             :  * find_lock_entries - Find a batch of pagecache entries.
    2097             :  * @mapping:    The address_space to search.
    2098             :  * @start:      The starting page cache index.
    2099             :  * @end:        The final page index (inclusive).
    2100             :  * @fbatch:     Where the resulting entries are placed.
    2101             :  * @indices:    The cache indices of the entries in @fbatch.
    2102             :  *
    2103             :  * find_lock_entries() will return a batch of entries from @mapping.
    2104             :  * Swap, shadow and DAX entries are included.  Folios are returned
    2105             :  * locked and with an incremented refcount.  Folios which are locked
    2106             :  * by somebody else or under writeback are skipped.  Folios which are
    2107             :  * partially outside the range are not returned.
    2108             :  *
    2109             :  * The entries have ascending indexes.  The indices may not be consecutive
    2110             :  * due to not-present entries, large folios, folios which could not be
    2111             :  * locked or folios under writeback.
    2112             :  *
    2113             :  * Return: The number of entries which were found.
    2114             :  */
    2115           0 : unsigned find_lock_entries(struct address_space *mapping, pgoff_t start,
    2116             :                 pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)
    2117             : {
    2118           0 :         XA_STATE(xas, &mapping->i_pages, start);
    2119             :         struct folio *folio;
    2120             : 
    2121             :         rcu_read_lock();
    2122           0 :         while ((folio = find_get_entry(&xas, end, XA_PRESENT))) {
    2123           0 :                 if (!xa_is_value(folio)) {
    2124           0 :                         if (folio->index < start)
    2125             :                                 goto put;
    2126           0 :                         if (folio->index + folio_nr_pages(folio) - 1 > end)
    2127             :                                 goto put;
    2128           0 :                         if (!folio_trylock(folio))
    2129             :                                 goto put;
    2130           0 :                         if (folio->mapping != mapping ||
    2131           0 :                             folio_test_writeback(folio))
    2132             :                                 goto unlock;
    2133             :                         VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index),
    2134             :                                         folio);
    2135             :                 }
    2136           0 :                 indices[fbatch->nr] = xas.xa_index;
    2137           0 :                 if (!folio_batch_add(fbatch, folio))
    2138             :                         break;
    2139           0 :                 continue;
    2140             : unlock:
    2141             :                 folio_unlock(folio);
    2142             : put:
    2143             :                 folio_put(folio);
    2144             :         }
    2145             :         rcu_read_unlock();
    2146             : 
    2147           0 :         return folio_batch_count(fbatch);
    2148             : }
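
A sketch of the batched pattern used by truncate-like callers (simplified, with an assumed helper name; real callers such as truncate also strip the value entries and use folio_batch_release()):

        #include <linux/pagemap.h>
        #include <linux/pagevec.h>
        #include <linux/sched.h>

        static void example_walk_locked(struct address_space *mapping,
                                        pgoff_t start, pgoff_t end)
        {
                struct folio_batch fbatch;
                pgoff_t indices[PAGEVEC_SIZE];
                pgoff_t index = start;
                unsigned int i;

                folio_batch_init(&fbatch);
                while (find_lock_entries(mapping, index, end, &fbatch, indices)) {
                        /* Continue after the last entry this batch covered. */
                        index = indices[folio_batch_count(&fbatch) - 1] + 1;

                        for (i = 0; i < folio_batch_count(&fbatch); i++) {
                                struct folio *folio = fbatch.folios[i];

                                if (xa_is_value(folio))
                                        continue;       /* value entries are neither locked nor referenced */
                                /* ... operate on the locked folio ... */
                                folio_unlock(folio);
                                folio_put(folio);
                        }
                        folio_batch_init(&fbatch);      /* reset for the next batch */
                        cond_resched();
                }
        }
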
    2149             : 
    2150             : static inline
    2151             : bool folio_more_pages(struct folio *folio, pgoff_t index, pgoff_t max)
    2152             : {
    2153           0 :         if (!folio_test_large(folio) || folio_test_hugetlb(folio))
    2154             :                 return false;
    2155           0 :         if (index >= max)
    2156             :                 return false;
    2157           0 :         return index < folio->index + folio_nr_pages(folio) - 1;
    2158             : }
    2159             : 
    2160             : /**
    2161             :  * find_get_pages_range - gang pagecache lookup
    2162             :  * @mapping:    The address_space to search
    2163             :  * @start:      The starting page index
    2164             :  * @end:        The final page index (inclusive)
    2165             :  * @nr_pages:   The maximum number of pages
    2166             :  * @pages:      Where the resulting pages are placed
    2167             :  *
    2168             :  * find_get_pages_range() will search for and return a group of up to @nr_pages
    2169             :  * pages in the mapping starting at index @start and up to index @end
    2170             :  * (inclusive).  The pages are placed at @pages.  find_get_pages_range() takes
    2171             :  * a reference against the returned pages.
    2172             :  *
    2173             :  * The search returns a group of mapping-contiguous pages with ascending
    2174             :  * indexes.  There may be holes in the indices due to not-present pages.
    2175             :  * We also update @start to index the next page for the traversal.
    2176             :  *
    2177             :  * Return: the number of pages which were found. If this number is
    2178             :  * smaller than @nr_pages, the end of the specified range has been
    2179             :  * reached.
    2180             :  */
    2181           0 : unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
    2182             :                               pgoff_t end, unsigned int nr_pages,
    2183             :                               struct page **pages)
    2184             : {
    2185           0 :         XA_STATE(xas, &mapping->i_pages, *start);
    2186             :         struct folio *folio;
    2187           0 :         unsigned ret = 0;
    2188             : 
    2189           0 :         if (unlikely(!nr_pages))
    2190             :                 return 0;
    2191             : 
    2192             :         rcu_read_lock();
    2193           0 :         while ((folio = find_get_entry(&xas, end, XA_PRESENT))) {
    2194             :                 /* Skip over shadow, swap and DAX entries */
    2195           0 :                 if (xa_is_value(folio))
    2196           0 :                         continue;
    2197             : 
    2198             : again:
    2199           0 :                 pages[ret] = folio_file_page(folio, xas.xa_index);
    2200           0 :                 if (++ret == nr_pages) {
    2201           0 :                         *start = xas.xa_index + 1;
    2202           0 :                         goto out;
    2203             :                 }
    2204           0 :                 if (folio_more_pages(folio, xas.xa_index, end)) {
    2205           0 :                         xas.xa_index++;
    2206             :                         folio_ref_inc(folio);
    2207             :                         goto again;
    2208             :                 }
    2209             :         }
    2210             : 
    2211             :         /*
    2212             :          * We come here when there is no page beyond @end. We take care to not
    2213             :          * overflow the index @start as it confuses some of the callers. This
    2214             :          * breaks the iteration when there is a page at index -1 but that is
    2215             :          * already broken anyway.
    2216             :          */
    2217           0 :         if (end == (pgoff_t)-1)
    2218           0 :                 *start = (pgoff_t)-1;
    2219             :         else
    2220           0 :                 *start = end + 1;
    2221             : out:
    2222             :         rcu_read_unlock();
    2223             : 
    2224           0 :         return ret;
    2225             : }
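
A minimal sketch of the intended calling pattern (assumed helper name), looping until the range is exhausted; note that @start is advanced by the function itself:

        #include <linux/pagemap.h>

        static void example_range_lookup(struct address_space *mapping,
                                         pgoff_t start, pgoff_t end)
        {
                struct page *pages[16];
                unsigned int i, nr;

                do {
                        nr = find_get_pages_range(mapping, &start, end, 16, pages);
                        for (i = 0; i < nr; i++) {
                                /* ... use pages[i] ... */
                                put_page(pages[i]);     /* drop the lookup reference */
                        }
                } while (nr == 16);     /* a short batch means the range is exhausted */
        }
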
    2226             : 
    2227             : /**
    2228             :  * find_get_pages_contig - gang contiguous pagecache lookup
    2229             :  * @mapping:    The address_space to search
    2230             :  * @index:      The starting page index
    2231             :  * @nr_pages:   The maximum number of pages
    2232             :  * @pages:      Where the resulting pages are placed
    2233             :  *
    2234             :  * find_get_pages_contig() works exactly like find_get_pages_range(),
    2235             :  * except that the returned pages are guaranteed to be
    2236             :  * contiguous.
    2237             :  *
    2238             :  * Return: the number of pages which were found.
    2239             :  */
    2240           0 : unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
    2241             :                                unsigned int nr_pages, struct page **pages)
    2242             : {
    2243           0 :         XA_STATE(xas, &mapping->i_pages, index);
    2244             :         struct folio *folio;
    2245           0 :         unsigned int ret = 0;
    2246             : 
    2247           0 :         if (unlikely(!nr_pages))
    2248             :                 return 0;
    2249             : 
    2250             :         rcu_read_lock();
    2251           0 :         for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) {
    2252           0 :                 if (xas_retry(&xas, folio))
    2253           0 :                         continue;
    2254             :                 /*
    2255             :                  * If the entry has been swapped out, we can stop looking.
    2256             :                  * No current caller is looking for DAX entries.
    2257             :                  */
    2258           0 :                 if (xa_is_value(folio))
    2259             :                         break;
    2260             : 
    2261           0 :                 if (!folio_try_get_rcu(folio))
    2262             :                         goto retry;
    2263             : 
    2264           0 :                 if (unlikely(folio != xas_reload(&xas)))
    2265             :                         goto put_page;
    2266             : 
    2267             : again:
    2268           0 :                 pages[ret] = folio_file_page(folio, xas.xa_index);
    2269           0 :                 if (++ret == nr_pages)
    2270             :                         break;
    2271           0 :                 if (folio_more_pages(folio, xas.xa_index, ULONG_MAX)) {
    2272           0 :                         xas.xa_index++;
    2273             :                         folio_ref_inc(folio);
    2274             :                         goto again;
    2275             :                 }
    2276           0 :                 continue;
    2277             : put_page:
    2278             :                 folio_put(folio);
    2279             : retry:
    2280           0 :                 xas_reset(&xas);
    2281             :         }
    2282             :         rcu_read_unlock();
    2283           0 :         return ret;
    2284             : }
    2285             : EXPORT_SYMBOL(find_get_pages_contig);
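
For illustration (assumed helper name), a caller that only wants to proceed when an index-contiguous run of pages is fully present in the cache might use it like this:

        #include <linux/kernel.h>
        #include <linux/pagemap.h>

        static bool example_have_contig_run(struct address_space *mapping,
                                            pgoff_t index, unsigned int nr)
        {
                struct page *pages[16];
                unsigned int i, got;

                if (WARN_ON_ONCE(nr > ARRAY_SIZE(pages)))
                        return false;

                got = find_get_pages_contig(mapping, index, nr, pages);
                for (i = 0; i < got; i++)
                        put_page(pages[i]);     /* this sketch only checks presence */

                return got == nr;       /* a shorter run means a hole or swapped-out entry */
        }
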
    2286             : 
    2287             : /**
    2288             :  * find_get_pages_range_tag - Find and return head pages matching @tag.
    2289             :  * @mapping:    the address_space to search
    2290             :  * @index:      the starting page index
    2291             :  * @end:        The final page index (inclusive)
    2292             :  * @tag:        the tag index
    2293             :  * @nr_pages:   the maximum number of pages
    2294             :  * @pages:      where the resulting pages are placed
    2295             :  *
    2296             :  * Like find_get_pages_range(), except we only return head pages which are
    2297             :  * tagged with @tag.  @index is updated to the index immediately after the
    2298             :  * last page we return, ready for the next iteration.
    2299             :  *
    2300             :  * Return: the number of pages which were found.
    2301             :  */
    2302           0 : unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
    2303             :                         pgoff_t end, xa_mark_t tag, unsigned int nr_pages,
    2304             :                         struct page **pages)
    2305             : {
    2306           0 :         XA_STATE(xas, &mapping->i_pages, *index);
    2307             :         struct folio *folio;
    2308           0 :         unsigned ret = 0;
    2309             : 
    2310           0 :         if (unlikely(!nr_pages))
    2311             :                 return 0;
    2312             : 
    2313             :         rcu_read_lock();
    2314           0 :         while ((folio = find_get_entry(&xas, end, tag))) {
    2315             :                 /*
    2316             :                  * Shadow entries should never be tagged, but this iteration
    2317             :                  * is lockless so there is a window for page reclaim to evict
    2318             :                  * a page we saw tagged.  Skip over it.
    2319             :                  */
    2320           0 :                 if (xa_is_value(folio))
    2321           0 :                         continue;
    2322             : 
    2323           0 :                 pages[ret] = &folio->page;
    2324           0 :                 if (++ret == nr_pages) {
    2325           0 :                         *index = folio->index + folio_nr_pages(folio);
    2326           0 :                         goto out;
    2327             :                 }
    2328             :         }
    2329             : 
    2330             :         /*
    2331             :          * We come here when we got to @end. We take care to not overflow the
    2332             :          * index @index as it confuses some of the callers. This breaks the
    2333             :          * iteration when there is a page at index -1 but that is already
    2334             :          * broken anyway.
    2335             :          */
    2336           0 :         if (end == (pgoff_t)-1)
    2337           0 :                 *index = (pgoff_t)-1;
    2338             :         else
    2339           0 :                 *index = end + 1;
    2340             : out:
    2341             :         rcu_read_unlock();
    2342             : 
    2343           0 :         return ret;
    2344             : }
    2345             : EXPORT_SYMBOL(find_get_pages_range_tag);
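
A sketch of the typical writeback-style loop over tagged pages (assumed helper name; real writeback callers also lock each page and re-check its state under the lock):

        #include <linux/fs.h>
        #include <linux/pagemap.h>
        #include <linux/sched.h>

        static void example_walk_dirty(struct address_space *mapping,
                                       pgoff_t start, pgoff_t end)
        {
                struct page *pages[16];
                pgoff_t index = start;
                unsigned int i, nr;

                while ((nr = find_get_pages_range_tag(mapping, &index, end,
                                                      PAGECACHE_TAG_DIRTY, 16, pages))) {
                        for (i = 0; i < nr; i++) {
                                /* ... queue pages[i] for writeback ... */
                                put_page(pages[i]);
                        }
                        cond_resched();
                }
        }
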
    2346             : 
    2347             : /*
    2348             :  * CD/DVDs are error prone. When a medium error occurs, the driver may fail
    2349             :  * a _large_ part of the i/o request. Imagine the worst scenario:
    2350             :  *
    2351             :  *      ---R__________________________________________B__________
    2352             :  *      ---R__________________________________________B__________
    2353             :  *
    2354             :  * read(R) => miss => readahead(R...B) => media error => frustrating retries
    2355             :  * => failing the whole request => read(R) => read(R+1) =>
    2356             :  * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
    2357             :  * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
    2358             :  * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
    2359             :  *
    2360             :  * It is going insane. Fix it by quickly scaling down the readahead size.
    2361             :  */
    2362             : static void shrink_readahead_size_eio(struct file_ra_state *ra)
    2363             : {
    2364           0 :         ra->ra_pages /= 4;
    2365             : }
    2366             : 
    2367             : /*
    2368             :  * filemap_get_read_batch - Get a batch of folios for read
    2369             :  *
    2370             :  * Get a batch of folios which represent a contiguous range of bytes in
    2371             :  * the file.  No exceptional entries will be returned.  If @index is in
    2372             :  * the middle of a folio, the entire folio will be returned.  The last
    2373             :  * folio in the batch may have the readahead flag set or the uptodate flag
    2374             :  * clear so that the caller can take the appropriate action.
    2375             :  */
    2376           0 : static void filemap_get_read_batch(struct address_space *mapping,
    2377             :                 pgoff_t index, pgoff_t max, struct folio_batch *fbatch)
    2378             : {
    2379           0 :         XA_STATE(xas, &mapping->i_pages, index);
    2380             :         struct folio *folio;
    2381             : 
    2382             :         rcu_read_lock();
    2383           0 :         for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) {
    2384           0 :                 if (xas_retry(&xas, folio))
    2385           0 :                         continue;
    2386           0 :                 if (xas.xa_index > max || xa_is_value(folio))
    2387             :                         break;
    2388           0 :                 if (!folio_try_get_rcu(folio))
    2389             :                         goto retry;
    2390             : 
    2391           0 :                 if (unlikely(folio != xas_reload(&xas)))
    2392             :                         goto put_folio;
    2393             : 
    2394           0 :                 if (!folio_batch_add(fbatch, folio))
    2395             :                         break;
    2396           0 :                 if (!folio_test_uptodate(folio))
    2397             :                         break;
    2398           0 :                 if (folio_test_readahead(folio))
    2399             :                         break;
    2400           0 :                 xas_advance(&xas, folio->index + folio_nr_pages(folio) - 1);
    2401           0 :                 continue;
    2402             : put_folio:
    2403             :                 folio_put(folio);
    2404             : retry:
    2405           0 :                 xas_reset(&xas);
    2406             :         }
    2407             :         rcu_read_unlock();
    2408           0 : }
    2409             : 
    2410           0 : static int filemap_read_folio(struct file *file, struct address_space *mapping,
    2411             :                 struct folio *folio)
    2412             : {
    2413             :         int error;
    2414             : 
    2415             :         /*
    2416             :          * A previous I/O error may have been due to temporary failures,
    2417             :          * e.g. multipath errors.  PG_error will be set again if readpage
    2418             :          * fails.
    2419             :          */
    2420           0 :         folio_clear_error(folio);
    2421             :         /* Start the actual read. The read will unlock the page. */
    2422           0 :         error = mapping->a_ops->readpage(file, &folio->page);
    2423           0 :         if (error)
    2424             :                 return error;
    2425             : 
    2426           0 :         error = folio_wait_locked_killable(folio);
    2427           0 :         if (error)
    2428             :                 return error;
    2429           0 :         if (folio_test_uptodate(folio))
    2430             :                 return 0;
    2431           0 :         shrink_readahead_size_eio(&file->f_ra);
    2432             :         return -EIO;
    2433             : }
    2434             : 
    2435           0 : static bool filemap_range_uptodate(struct address_space *mapping,
    2436             :                 loff_t pos, struct iov_iter *iter, struct folio *folio)
    2437             : {
    2438             :         int count;
    2439             : 
    2440           0 :         if (folio_test_uptodate(folio))
    2441             :                 return true;
    2442             :         /* pipes can't handle partially uptodate pages */
    2443           0 :         if (iov_iter_is_pipe(iter))
    2444             :                 return false;
    2445           0 :         if (!mapping->a_ops->is_partially_uptodate)
    2446             :                 return false;
    2447           0 :         if (mapping->host->i_blkbits >= folio_shift(folio))
    2448             :                 return false;
    2449             : 
    2450           0 :         count = iter->count;
    2451           0 :         if (folio_pos(folio) > pos) {
    2452           0 :                 count -= folio_pos(folio) - pos;
    2453           0 :                 pos = 0;
    2454             :         } else {
    2455           0 :                 pos -= folio_pos(folio);
    2456             :         }
    2457             : 
    2458           0 :         return mapping->a_ops->is_partially_uptodate(folio, pos, count);
    2459             : }
    2460             : 
    2461           0 : static int filemap_update_page(struct kiocb *iocb,
    2462             :                 struct address_space *mapping, struct iov_iter *iter,
    2463             :                 struct folio *folio)
    2464             : {
    2465             :         int error;
    2466             : 
    2467           0 :         if (iocb->ki_flags & IOCB_NOWAIT) {
    2468           0 :                 if (!filemap_invalidate_trylock_shared(mapping))
    2469             :                         return -EAGAIN;
    2470             :         } else {
    2471             :                 filemap_invalidate_lock_shared(mapping);
    2472             :         }
    2473             : 
    2474           0 :         if (!folio_trylock(folio)) {
    2475           0 :                 error = -EAGAIN;
    2476           0 :                 if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))
    2477             :                         goto unlock_mapping;
    2478           0 :                 if (!(iocb->ki_flags & IOCB_WAITQ)) {
    2479           0 :                         filemap_invalidate_unlock_shared(mapping);
    2480             :                         /*
    2481             :                          * This is where we usually end up waiting for a
    2482             :                          * previously submitted readahead to finish.
    2483             :                          */
    2484           0 :                         folio_put_wait_locked(folio, TASK_KILLABLE);
    2485           0 :                         return AOP_TRUNCATED_PAGE;
    2486             :                 }
    2487           0 :                 error = __folio_lock_async(folio, iocb->ki_waitq);
    2488           0 :                 if (error)
    2489             :                         goto unlock_mapping;
    2490             :         }
    2491             : 
    2492           0 :         error = AOP_TRUNCATED_PAGE;
    2493           0 :         if (!folio->mapping)
    2494             :                 goto unlock;
    2495             : 
    2496           0 :         error = 0;
    2497           0 :         if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, folio))
    2498             :                 goto unlock;
    2499             : 
    2500           0 :         error = -EAGAIN;
    2501           0 :         if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ))
    2502             :                 goto unlock;
    2503             : 
    2504           0 :         error = filemap_read_folio(iocb->ki_filp, mapping, folio);
    2505           0 :         goto unlock_mapping;
    2506             : unlock:
    2507             :         folio_unlock(folio);
    2508             : unlock_mapping:
    2509           0 :         filemap_invalidate_unlock_shared(mapping);
    2510           0 :         if (error == AOP_TRUNCATED_PAGE)
    2511             :                 folio_put(folio);
    2512             :         return error;
    2513             : }
    2514             : 
    2515           0 : static int filemap_create_folio(struct file *file,
    2516             :                 struct address_space *mapping, pgoff_t index,
    2517             :                 struct folio_batch *fbatch)
    2518             : {
    2519             :         struct folio *folio;
    2520             :         int error;
    2521             : 
    2522           0 :         folio = filemap_alloc_folio(mapping_gfp_mask(mapping), 0);
    2523           0 :         if (!folio)
    2524             :                 return -ENOMEM;
    2525             : 
    2526             :         /*
    2527             :          * Protect against truncate / hole punch. Grabbing invalidate_lock
    2528             :          * here assures we cannot instantiate and bring uptodate new
    2529             :          * pagecache folios after evicting page cache during truncate
    2530             :          * and before actually freeing blocks.  Note that we could
    2531             :          * release invalidate_lock after inserting the folio into
    2532             :          * the page cache as the locked folio would then be enough to
    2533             :          * synchronize with hole punching. But there are code paths
    2534             :          * such as filemap_update_page() filling in partially uptodate
    2535             :          * pages or ->readahead() that need to hold invalidate_lock
    2536             :          * while mapping blocks for IO so let's hold the lock here as
    2537             :          * well to keep locking rules simple.
    2538             :          */
    2539           0 :         filemap_invalidate_lock_shared(mapping);
    2540           0 :         error = filemap_add_folio(mapping, folio, index,
    2541             :                         mapping_gfp_constraint(mapping, GFP_KERNEL));
    2542           0 :         if (error == -EEXIST)
    2543           0 :                 error = AOP_TRUNCATED_PAGE;
    2544           0 :         if (error)
    2545             :                 goto error;
    2546             : 
    2547           0 :         error = filemap_read_folio(file, mapping, folio);
    2548           0 :         if (error)
    2549             :                 goto error;
    2550             : 
    2551           0 :         filemap_invalidate_unlock_shared(mapping);
    2552           0 :         folio_batch_add(fbatch, folio);
    2553           0 :         return 0;
    2554             : error:
    2555           0 :         filemap_invalidate_unlock_shared(mapping);
    2556             :         folio_put(folio);
    2557             :         return error;
    2558             : }
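
The invalidate_lock rule described in the comment above has a filesystem-side counterpart: a hole-punch path is expected to hold the lock exclusively while it removes page cache and frees blocks. Roughly (a simplified sketch with an assumed helper name, not any particular filesystem's code):

        #include <linux/fs.h>
        #include <linux/mm.h>

        static void example_punch_hole(struct inode *inode, loff_t start, loff_t len)
        {
                struct address_space *mapping = inode->i_mapping;

                filemap_invalidate_lock(mapping);
                truncate_pagecache_range(inode, start, start + len - 1);
                /* ... free the underlying blocks while the lock is still held ... */
                filemap_invalidate_unlock(mapping);
        }
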
    2559             : 
    2560           0 : static int filemap_readahead(struct kiocb *iocb, struct file *file,
    2561             :                 struct address_space *mapping, struct folio *folio,
    2562             :                 pgoff_t last_index)
    2563             : {
    2564           0 :         DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, folio->index);
    2565             : 
    2566           0 :         if (iocb->ki_flags & IOCB_NOIO)
    2567             :                 return -EAGAIN;
    2568           0 :         page_cache_async_ra(&ractl, folio, last_index - folio->index);
    2569             :         return 0;
    2570             : }
    2571             : 
    2572           0 : static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
    2573             :                 struct folio_batch *fbatch)
    2574             : {
    2575           0 :         struct file *filp = iocb->ki_filp;
    2576           0 :         struct address_space *mapping = filp->f_mapping;
    2577           0 :         struct file_ra_state *ra = &filp->f_ra;
    2578           0 :         pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
    2579             :         pgoff_t last_index;
    2580             :         struct folio *folio;
    2581           0 :         int err = 0;
    2582             : 
    2583           0 :         last_index = DIV_ROUND_UP(iocb->ki_pos + iter->count, PAGE_SIZE);
    2584             : retry:
    2585           0 :         if (fatal_signal_pending(current))
    2586             :                 return -EINTR;
    2587             : 
    2588           0 :         filemap_get_read_batch(mapping, index, last_index, fbatch);
    2589           0 :         if (!folio_batch_count(fbatch)) {
    2590           0 :                 if (iocb->ki_flags & IOCB_NOIO)
    2591             :                         return -EAGAIN;
    2592           0 :                 page_cache_sync_readahead(mapping, ra, filp, index,
    2593             :                                 last_index - index);
    2594           0 :                 filemap_get_read_batch(mapping, index, last_index, fbatch);
    2595             :         }
    2596           0 :         if (!folio_batch_count(fbatch)) {
    2597           0 :                 if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
    2598             :                         return -EAGAIN;
    2599           0 :                 err = filemap_create_folio(filp, mapping,
    2600           0 :                                 iocb->ki_pos >> PAGE_SHIFT, fbatch);
    2601           0 :                 if (err == AOP_TRUNCATED_PAGE)
    2602             :                         goto retry;
    2603             :                 return err;
    2604             :         }
    2605             : 
    2606           0 :         folio = fbatch->folios[folio_batch_count(fbatch) - 1];
    2607           0 :         if (folio_test_readahead(folio)) {
    2608           0 :                 err = filemap_readahead(iocb, filp, mapping, folio, last_index);
    2609           0 :                 if (err)
    2610             :                         goto err;
    2611             :         }
    2612           0 :         if (!folio_test_uptodate(folio)) {
    2613           0 :                 if ((iocb->ki_flags & IOCB_WAITQ) &&
    2614           0 :                     folio_batch_count(fbatch) > 1)
    2615           0 :                         iocb->ki_flags |= IOCB_NOWAIT;
    2616           0 :                 err = filemap_update_page(iocb, mapping, iter, folio);
    2617           0 :                 if (err)
    2618             :                         goto err;
    2619             :         }
    2620             : 
    2621             :         return 0;
    2622             : err:
    2623           0 :         if (err < 0)
    2624             :                 folio_put(folio);
    2625           0 :         if (likely(--fbatch->nr))
    2626             :                 return 0;
    2627           0 :         if (err == AOP_TRUNCATED_PAGE)
    2628             :                 goto retry;
    2629             :         return err;
    2630             : }
    2631             : 
    2632             : /**
    2633             :  * filemap_read - Read data from the page cache.
    2634             :  * @iocb: The iocb to read.
    2635             :  * @iter: Destination for the data.
    2636             :  * @already_read: Number of bytes already read by the caller.
    2637             :  *
    2638             :  * Copies data from the page cache.  If the data is not currently present,
    2639             :  * uses the readahead and readpage address_space operations to fetch it.
    2640             :  *
    2641             :  * Return: Total number of bytes copied, including those already read by
    2642             :  * the caller.  If an error happens before any bytes are copied, returns
    2643             :  * a negative error number.
    2644             :  */
    2645           0 : ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
    2646             :                 ssize_t already_read)
    2647             : {
    2648           0 :         struct file *filp = iocb->ki_filp;
    2649           0 :         struct file_ra_state *ra = &filp->f_ra;
    2650           0 :         struct address_space *mapping = filp->f_mapping;
    2651           0 :         struct inode *inode = mapping->host;
    2652             :         struct folio_batch fbatch;
    2653           0 :         int i, error = 0;
    2654             :         bool writably_mapped;
    2655             :         loff_t isize, end_offset;
    2656             : 
    2657           0 :         if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
    2658             :                 return 0;
    2659           0 :         if (unlikely(!iov_iter_count(iter)))
    2660             :                 return 0;
    2661             : 
    2662           0 :         iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
    2663           0 :         folio_batch_init(&fbatch);
    2664             : 
    2665             :         do {
    2666           0 :                 cond_resched();
    2667             : 
    2668             :                 /*
    2669             :                  * If we've already successfully copied some data, then we
    2670             :                  * can no longer safely return -EIOCBQUEUED. Hence mark
    2671             :                  * an async read NOWAIT at that point.
    2672             :                  */
    2673           0 :                 if ((iocb->ki_flags & IOCB_WAITQ) && already_read)
    2674           0 :                         iocb->ki_flags |= IOCB_NOWAIT;
    2675             : 
    2676           0 :                 if (unlikely(iocb->ki_pos >= i_size_read(inode)))
    2677             :                         break;
    2678             : 
    2679           0 :                 error = filemap_get_pages(iocb, iter, &fbatch);
    2680           0 :                 if (error < 0)
    2681             :                         break;
    2682             : 
    2683             :                 /*
    2684             :                  * i_size must be checked after we know the pages are Uptodate.
    2685             :                  *
                     2686             :                  * Checking i_size after the uptodate check allows us to calculate
    2687             :                  * the correct value for "nr", which means the zero-filled
    2688             :                  * part of the page is not copied back to userspace (unless
    2689             :                  * another truncate extends the file - this is desired though).
    2690             :                  */
    2691           0 :                 isize = i_size_read(inode);
    2692           0 :                 if (unlikely(iocb->ki_pos >= isize))
    2693             :                         goto put_folios;
    2694           0 :                 end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
    2695             : 
    2696             :                 /*
    2697             :                  * Once we start copying data, we don't want to be touching any
    2698             :                  * cachelines that might be contended:
    2699             :                  */
    2700           0 :                 writably_mapped = mapping_writably_mapped(mapping);
    2701             : 
    2702             :                 /*
    2703             :                  * When a sequential read accesses a page several times, only
    2704             :                  * mark it as accessed the first time.
    2705             :                  */
    2706           0 :                 if (iocb->ki_pos >> PAGE_SHIFT !=
    2707           0 :                     ra->prev_pos >> PAGE_SHIFT)
    2708           0 :                         folio_mark_accessed(fbatch.folios[0]);
    2709             : 
    2710           0 :                 for (i = 0; i < folio_batch_count(&fbatch); i++) {
    2711           0 :                         struct folio *folio = fbatch.folios[i];
    2712           0 :                         size_t fsize = folio_size(folio);
    2713           0 :                         size_t offset = iocb->ki_pos & (fsize - 1);
    2714           0 :                         size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos,
    2715             :                                              fsize - offset);
    2716             :                         size_t copied;
    2717             : 
    2718           0 :                         if (end_offset < folio_pos(folio))
    2719             :                                 break;
    2720           0 :                         if (i > 0)
    2721           0 :                                 folio_mark_accessed(folio);
    2722             :                         /*
    2723             :                          * If users can be writing to this folio using arbitrary
    2724             :                          * virtual addresses, take care of potential aliasing
    2725             :                          * before reading the folio on the kernel side.
    2726             :                          */
    2727             :                         if (writably_mapped)
    2728             :                                 flush_dcache_folio(folio);
    2729             : 
    2730           0 :                         copied = copy_folio_to_iter(folio, offset, bytes, iter);
    2731             : 
    2732           0 :                         already_read += copied;
    2733           0 :                         iocb->ki_pos += copied;
    2734           0 :                         ra->prev_pos = iocb->ki_pos;
    2735             : 
    2736           0 :                         if (copied < bytes) {
    2737             :                                 error = -EFAULT;
    2738             :                                 break;
    2739             :                         }
    2740             :                 }
    2741             : put_folios:
    2742           0 :                 for (i = 0; i < folio_batch_count(&fbatch); i++)
    2743           0 :                         folio_put(fbatch.folios[i]);
    2744           0 :                 folio_batch_init(&fbatch);
    2745           0 :         } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);
    2746             : 
    2747           0 :         file_accessed(filp);
    2748             : 
    2749           0 :         return already_read ? already_read : error;
    2750             : }
    2751             : EXPORT_SYMBOL_GPL(filemap_read);
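
For a filesystem that wants to wrap the cached read path with its own preparation and cleanup, the expected call shape is roughly the following (assumed function name; generic_file_read_iter() below is the canonical in-tree caller):

        #include <linux/fs.h>
        #include <linux/uio.h>

        static ssize_t example_read_iter(struct kiocb *iocb, struct iov_iter *to)
        {
                ssize_t ret;

                /* ... filesystem-specific preparation (locks, leases, ...) ... */
                ret = filemap_read(iocb, to, 0);
                /* ... filesystem-specific cleanup ... */
                return ret;
        }
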
    2752             : 
    2753             : /**
    2754             :  * generic_file_read_iter - generic filesystem read routine
    2755             :  * @iocb:       kernel I/O control block
    2756             :  * @iter:       destination for the data read
    2757             :  *
    2758             :  * This is the "read_iter()" routine for all filesystems
    2759             :  * that can use the page cache directly.
    2760             :  *
    2761             :  * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall
    2762             :  * be returned when no data can be read without waiting for I/O requests
    2763             :  * to complete; it doesn't prevent readahead.
    2764             :  *
    2765             :  * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O
    2766             :  * requests shall be made for the read or for readahead.  When no data
    2767             :  * can be read, -EAGAIN shall be returned.  When readahead would be
    2768             :  * triggered, a partial, possibly empty read shall be returned.
    2769             :  *
    2770             :  * Return:
    2771             :  * * number of bytes copied, even for partial reads
    2772             :  * * negative error code (or 0 if IOCB_NOIO) if nothing was read
    2773             :  */
    2774             : ssize_t
    2775           0 : generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
    2776             : {
    2777           0 :         size_t count = iov_iter_count(iter);
    2778           0 :         ssize_t retval = 0;
    2779             : 
    2780           0 :         if (!count)
    2781             :                 return 0; /* skip atime */
    2782             : 
    2783           0 :         if (iocb->ki_flags & IOCB_DIRECT) {
    2784           0 :                 struct file *file = iocb->ki_filp;
    2785           0 :                 struct address_space *mapping = file->f_mapping;
    2786           0 :                 struct inode *inode = mapping->host;
    2787             : 
    2788           0 :                 if (iocb->ki_flags & IOCB_NOWAIT) {
    2789           0 :                         if (filemap_range_needs_writeback(mapping, iocb->ki_pos,
    2790           0 :                                                 iocb->ki_pos + count - 1))
    2791             :                                 return -EAGAIN;
    2792             :                 } else {
    2793           0 :                         retval = filemap_write_and_wait_range(mapping,
    2794             :                                                 iocb->ki_pos,
    2795           0 :                                                 iocb->ki_pos + count - 1);
    2796           0 :                         if (retval < 0)
    2797             :                                 return retval;
    2798             :                 }
    2799             : 
    2800           0 :                 file_accessed(file);
    2801             : 
    2802           0 :                 retval = mapping->a_ops->direct_IO(iocb, iter);
    2803           0 :                 if (retval >= 0) {
    2804           0 :                         iocb->ki_pos += retval;
    2805           0 :                         count -= retval;
    2806             :                 }
    2807           0 :                 if (retval != -EIOCBQUEUED)
    2808           0 :                         iov_iter_revert(iter, count - iov_iter_count(iter));
    2809             : 
    2810             :                 /*
    2811             :                  * Btrfs can have a short DIO read if we encounter
    2812             :                  * compressed extents, so if there was an error, or if
    2813             :                  * we've already read everything we wanted to, or if
    2814             :                  * there was a short read because we hit EOF, go ahead
    2815             :                  * and return.  Otherwise fallthrough to buffered io for
    2816             :                  * the rest of the read.  Buffered reads will not work for
    2817             :                  * DAX files, so don't bother trying.
    2818             :                  */
    2819           0 :                 if (retval < 0 || !count || IS_DAX(inode))
    2820             :                         return retval;
    2821           0 :                 if (iocb->ki_pos >= i_size_read(inode))
    2822             :                         return retval;
    2823             :         }
    2824             : 
    2825           0 :         return filemap_read(iocb, iter, retval);
    2826             : }
    2827             : EXPORT_SYMBOL(generic_file_read_iter);
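
A filesystem that relies entirely on the page cache for reads typically just points its file_operations at this helper; a minimal sketch (the structure name is illustrative):

        #include <linux/fs.h>

        static const struct file_operations example_file_ops = {
                .llseek         = generic_file_llseek,
                .read_iter      = generic_file_read_iter,
                .mmap           = generic_file_mmap,
        };
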
    2828             : 
    2829           0 : static inline loff_t folio_seek_hole_data(struct xa_state *xas,
    2830             :                 struct address_space *mapping, struct folio *folio,
    2831             :                 loff_t start, loff_t end, bool seek_data)
    2832             : {
    2833           0 :         const struct address_space_operations *ops = mapping->a_ops;
    2834           0 :         size_t offset, bsz = i_blocksize(mapping->host);
    2835             : 
    2836           0 :         if (xa_is_value(folio) || folio_test_uptodate(folio))
    2837           0 :                 return seek_data ? start : end;
    2838           0 :         if (!ops->is_partially_uptodate)
    2839           0 :                 return seek_data ? end : start;
    2840             : 
    2841           0 :         xas_pause(xas);
    2842             :         rcu_read_unlock();
    2843           0 :         folio_lock(folio);
    2844           0 :         if (unlikely(folio->mapping != mapping))
    2845             :                 goto unlock;
    2846             : 
    2847           0 :         offset = offset_in_folio(folio, start) & ~(bsz - 1);
    2848             : 
    2849             :         do {
    2850           0 :                 if (ops->is_partially_uptodate(folio, offset, bsz) ==
    2851             :                                                         seek_data)
    2852             :                         break;
    2853           0 :                 start = (start + bsz) & ~(bsz - 1);
    2854           0 :                 offset += bsz;
    2855           0 :         } while (offset < folio_size(folio));
    2856             : unlock:
    2857           0 :         folio_unlock(folio);
    2858             :         rcu_read_lock();
    2859           0 :         return start;
    2860             : }
    2861             : 
    2862             : static inline size_t seek_folio_size(struct xa_state *xas, struct folio *folio)
    2863             : {
    2864           0 :         if (xa_is_value(folio))
    2865             :                 return PAGE_SIZE << xa_get_order(xas->xa, xas->xa_index);
    2866           0 :         return folio_size(folio);
    2867             : }
    2868             : 
    2869             : /**
    2870             :  * mapping_seek_hole_data - Seek for SEEK_DATA / SEEK_HOLE in the page cache.
    2871             :  * @mapping: Address space to search.
    2872             :  * @start: First byte to consider.
    2873             :  * @end: Limit of search (exclusive).
    2874             :  * @whence: Either SEEK_HOLE or SEEK_DATA.
    2875             :  *
    2876             :  * If the page cache knows which blocks contain holes and which blocks
    2877             :  * contain data, your filesystem can use this function to implement
    2878             :  * SEEK_HOLE and SEEK_DATA.  This is useful for filesystems which are
    2879             :  * entirely memory-based such as tmpfs, and filesystems which support
    2880             :  * unwritten extents.
    2881             :  *
    2882             :  * Return: The requested offset on success, or -ENXIO if @whence specifies
    2883             :  * SEEK_DATA and there is no data after @start.  There is an implicit hole
    2884             :  * after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start
    2885             :  * and @end contain data.
    2886             :  */
    2887           0 : loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start,
    2888             :                 loff_t end, int whence)
    2889             : {
    2890           0 :         XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT);
    2891           0 :         pgoff_t max = (end - 1) >> PAGE_SHIFT;
    2892           0 :         bool seek_data = (whence == SEEK_DATA);
    2893             :         struct folio *folio;
    2894             : 
    2895           0 :         if (end <= start)
    2896             :                 return -ENXIO;
    2897             : 
    2898             :         rcu_read_lock();
    2899           0 :         while ((folio = find_get_entry(&xas, max, XA_PRESENT))) {
    2900           0 :                 loff_t pos = (u64)xas.xa_index << PAGE_SHIFT;
    2901             :                 size_t seek_size;
    2902             : 
    2903           0 :                 if (start < pos) {
    2904           0 :                         if (!seek_data)
    2905             :                                 goto unlock;
    2906             :                         start = pos;
    2907             :                 }
    2908             : 
    2909           0 :                 seek_size = seek_folio_size(&xas, folio);
    2910           0 :                 pos = round_up((u64)pos + 1, seek_size);
    2911           0 :                 start = folio_seek_hole_data(&xas, mapping, folio, start, pos,
    2912             :                                 seek_data);
    2913           0 :                 if (start < pos)
    2914             :                         goto unlock;
    2915           0 :                 if (start >= end)
    2916             :                         break;
    2917           0 :                 if (seek_size > PAGE_SIZE)
    2918           0 :                         xas_set(&xas, pos >> PAGE_SHIFT);
    2919           0 :                 if (!xa_is_value(folio))
    2920             :                         folio_put(folio);
    2921             :         }
    2922           0 :         if (seek_data)
    2923           0 :                 start = -ENXIO;
    2924             : unlock:
    2925             :         rcu_read_unlock();
    2926           0 :         if (folio && !xa_is_value(folio))
    2927             :                 folio_put(folio);
    2928           0 :         if (start > end)
    2929             :                 return end;
    2930           0 :         return start;
    2931             : }
    2932             : 
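As a concrete illustration of the kernel-doc above, here is a minimal sketch of
how an entirely memory-based filesystem (tmpfs-like) might wire
mapping_seek_hole_data() into its ->llseek handler. example_file_llseek is an
illustrative name, not a kernel API; the structure follows the tmpfs pattern the
comment alludes to.

static loff_t example_file_llseek(struct file *file, loff_t offset, int whence)
{
        struct address_space *mapping = file->f_mapping;
        struct inode *inode = mapping->host;

        /* Ordinary SEEK_SET/SEEK_CUR/SEEK_END use the generic helper. */
        if (whence != SEEK_DATA && whence != SEEK_HOLE)
                return generic_file_llseek(file, offset, whence);
        if (offset < 0)
                return -ENXIO;

        inode_lock(inode);
        /* i_rwsem is held, so i_size is stable while the page cache is scanned. */
        offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence);
        if (offset >= 0)
                offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
        inode_unlock(inode);
        return offset;
}
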
    2933             : #ifdef CONFIG_MMU
    2934             : #define MMAP_LOTSAMISS  (100)
    2935             : /*
    2936             :  * lock_folio_maybe_drop_mmap - lock the folio, possibly dropping the mmap_lock
    2937             :  * @vmf - the vm_fault for this fault.
    2938             :  * @folio - the folio to lock.
    2939             :  * @fpin - the pointer to the file we may pin (or is already pinned).
    2940             :  *
    2941             :  * This works similarly to folio_lock_or_retry() in that it can drop the
    2942             :  * mmap_lock.  It differs in that it returns 1 with the folio locked, and
    2943             :  * returns 0 if it couldn't lock the folio.  If we did have
    2944             :  * to drop the mmap_lock then fpin will point to the pinned file and
    2945             :  * needs to be fput()'ed at a later point.
    2946             :  */
    2947           0 : static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio,
    2948             :                                      struct file **fpin)
    2949             : {
    2950           0 :         if (folio_trylock(folio))
    2951             :                 return 1;
    2952             : 
    2953             :         /*
    2954             :          * NOTE! This will make us return with VM_FAULT_RETRY, but with
    2955             :          * the mmap_lock still held. That's how FAULT_FLAG_RETRY_NOWAIT
    2956             :          * is supposed to work. We have way too many special cases..
    2957             :          */
    2958           0 :         if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
    2959             :                 return 0;
    2960             : 
    2961           0 :         *fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
    2962           0 :         if (vmf->flags & FAULT_FLAG_KILLABLE) {
    2963           0 :                 if (__folio_lock_killable(folio)) {
    2964             :                         /*
    2965             :                          * We didn't have the right flags to drop the mmap_lock,
    2966             :                          * but all fault handlers only check for fatal signals
    2967             :                          * if we return VM_FAULT_RETRY, so we need to drop the
    2968             :                          * mmap_lock here and return 0 if we don't have an fpin.
    2969             :                          */
    2970           0 :                         if (*fpin == NULL)
    2971           0 :                                 mmap_read_unlock(vmf->vma->vm_mm);
    2972             :                         return 0;
    2973             :                 }
    2974             :         } else
    2975             :                 __folio_lock(folio);
    2976             : 
    2977             :         return 1;
    2978             : }
    2979             : 
    2980             : /*
    2981             :  * Synchronous readahead happens when we don't even find a page in the page
    2982             :  * cache at all.  We don't want to perform IO under the mmap_lock, so if we
    2983             :  * have to drop the mmap_lock we return the file that was pinned in order to
    2984             :  * do the IO.  If we didn't pin a file then we return NULL.  The file that is
    2985             :  * returned needs to be fput()'ed when we're done with it.
    2986             :  */
    2987           0 : static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
    2988             : {
    2989           0 :         struct file *file = vmf->vma->vm_file;
    2990           0 :         struct file_ra_state *ra = &file->f_ra;
    2991           0 :         struct address_space *mapping = file->f_mapping;
    2992           0 :         DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);
    2993           0 :         struct file *fpin = NULL;
    2994             :         unsigned int mmap_miss;
    2995             : 
    2996             : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
    2997             :         /* Use the readahead code, even if readahead is disabled */
    2998             :         if (vmf->vma->vm_flags & VM_HUGEPAGE) {
    2999             :                 fpin = maybe_unlock_mmap_for_io(vmf, fpin);
    3000             :                 ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1);
    3001             :                 ra->size = HPAGE_PMD_NR;
    3002             :                 /*
    3003             :                  * Fetch two PMD folios, so we get the chance to actually
    3004             :                  * readahead, unless we've been told not to.
    3005             :                  */
    3006             :                 if (!(vmf->vma->vm_flags & VM_RAND_READ))
    3007             :                         ra->size *= 2;
    3008             :                 ra->async_size = HPAGE_PMD_NR;
    3009             :                 page_cache_ra_order(&ractl, ra, HPAGE_PMD_ORDER);
    3010             :                 return fpin;
    3011             :         }
    3012             : #endif
    3013             : 
    3014             :         /* If we don't want any read-ahead, don't bother */
    3015           0 :         if (vmf->vma->vm_flags & VM_RAND_READ)
    3016             :                 return fpin;
    3017           0 :         if (!ra->ra_pages)
    3018             :                 return fpin;
    3019             : 
    3020           0 :         if (vmf->vma->vm_flags & VM_SEQ_READ) {
    3021           0 :                 fpin = maybe_unlock_mmap_for_io(vmf, fpin);
    3022           0 :                 page_cache_sync_ra(&ractl, ra->ra_pages);
    3023           0 :                 return fpin;
    3024             :         }
    3025             : 
    3026             :         /* Avoid banging the cache line if not needed */
    3027           0 :         mmap_miss = READ_ONCE(ra->mmap_miss);
    3028           0 :         if (mmap_miss < MMAP_LOTSAMISS * 10)
    3029           0 :                 WRITE_ONCE(ra->mmap_miss, ++mmap_miss);
    3030             : 
    3031             :         /*
    3032             :          * Do we miss much more than hit in this file? If so,
    3033             :          * stop bothering with read-ahead. It will only hurt.
    3034             :          */
    3035           0 :         if (mmap_miss > MMAP_LOTSAMISS)
    3036             :                 return fpin;
    3037             : 
    3038             :         /*
    3039             :          * mmap read-around
    3040             :          */
    3041           0 :         fpin = maybe_unlock_mmap_for_io(vmf, fpin);
    3042           0 :         ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
    3043           0 :         ra->size = ra->ra_pages;
    3044           0 :         ra->async_size = ra->ra_pages / 4;
    3045           0 :         ractl._index = ra->start;
    3046           0 :         page_cache_ra_order(&ractl, ra, 0);
    3047           0 :         return fpin;
    3048             : }
    3049             : 
    3050             : /*
    3051             :  * Asynchronous readahead happens when we find the page with PG_readahead
    3052             :  * set, so we want to possibly extend the readahead further.  We return the
    3053             :  * file that was pinned if we have to drop the mmap_lock in order to do IO.
    3054             :  */
    3055           0 : static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
    3056             :                                             struct folio *folio)
    3057             : {
    3058           0 :         struct file *file = vmf->vma->vm_file;
    3059           0 :         struct file_ra_state *ra = &file->f_ra;
    3060           0 :         DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff);
    3061           0 :         struct file *fpin = NULL;
    3062             :         unsigned int mmap_miss;
    3063             : 
    3064             :         /* If we don't want any read-ahead, don't bother */
    3065           0 :         if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
    3066             :                 return fpin;
    3067             : 
    3068           0 :         mmap_miss = READ_ONCE(ra->mmap_miss);
    3069           0 :         if (mmap_miss)
    3070           0 :                 WRITE_ONCE(ra->mmap_miss, --mmap_miss);
    3071             : 
    3072           0 :         if (folio_test_readahead(folio)) {
    3073           0 :                 fpin = maybe_unlock_mmap_for_io(vmf, fpin);
    3074           0 :                 page_cache_async_ra(&ractl, folio, ra->ra_pages);
    3075             :         }
    3076             :         return fpin;
    3077             : }
    3078             : 
    3079             : /**
    3080             :  * filemap_fault - read in file data for page fault handling
    3081             :  * @vmf:        struct vm_fault containing details of the fault
    3082             :  *
    3083             :  * filemap_fault() is invoked via the vma operations vector for a
    3084             :  * mapped memory region to read in file data during a page fault.
    3085             :  *
    3086             :  * The goto's are kind of ugly, but this streamlines the normal case of having
    3087             :  * it in the page cache, and handles the special cases reasonably without
    3088             :  * having a lot of duplicated code.
    3089             :  *
    3090             :  * vma->vm_mm->mmap_lock must be held on entry.
    3091             :  *
    3092             :  * If our return value has VM_FAULT_RETRY set, it's because the mmap_lock
    3093             :  * may be dropped before doing I/O or by lock_folio_maybe_drop_mmap().
    3094             :  *
    3095             :  * If our return value does not have VM_FAULT_RETRY set, the mmap_lock
    3096             :  * has not been released.
    3097             :  *
    3098             :  * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
    3099             :  *
    3100             :  * Return: bitwise-OR of %VM_FAULT_ codes.
    3101             :  */
    3102           0 : vm_fault_t filemap_fault(struct vm_fault *vmf)
    3103             : {
    3104             :         int error;
    3105           0 :         struct file *file = vmf->vma->vm_file;
    3106           0 :         struct file *fpin = NULL;
    3107           0 :         struct address_space *mapping = file->f_mapping;
    3108           0 :         struct inode *inode = mapping->host;
    3109           0 :         pgoff_t max_idx, index = vmf->pgoff;
    3110             :         struct folio *folio;
    3111           0 :         vm_fault_t ret = 0;
    3112           0 :         bool mapping_locked = false;
    3113             : 
    3114           0 :         max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
    3115           0 :         if (unlikely(index >= max_idx))
    3116             :                 return VM_FAULT_SIGBUS;
    3117             : 
    3118             :         /*
    3119             :          * Do we have something in the page cache already?
    3120             :          */
    3121           0 :         folio = filemap_get_folio(mapping, index);
    3122           0 :         if (likely(folio)) {
    3123             :                 /*
    3124             :                  * We found the page, so try async readahead before waiting for
    3125             :                  * the lock.
    3126             :                  */
    3127           0 :                 if (!(vmf->flags & FAULT_FLAG_TRIED))
    3128           0 :                         fpin = do_async_mmap_readahead(vmf, folio);
    3129           0 :                 if (unlikely(!folio_test_uptodate(folio))) {
    3130           0 :                         filemap_invalidate_lock_shared(mapping);
    3131           0 :                         mapping_locked = true;
    3132             :                 }
    3133             :         } else {
    3134             :                 /* No page in the page cache at all */
    3135           0 :                 count_vm_event(PGMAJFAULT);
    3136           0 :                 count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
    3137           0 :                 ret = VM_FAULT_MAJOR;
    3138           0 :                 fpin = do_sync_mmap_readahead(vmf);
    3139             : retry_find:
    3140             :                 /*
    3141             :                  * See the comment in filemap_create_folio() for why we need
    3142             :                  * invalidate_lock
    3143             :                  */
    3144           0 :                 if (!mapping_locked) {
    3145           0 :                         filemap_invalidate_lock_shared(mapping);
    3146           0 :                         mapping_locked = true;
    3147             :                 }
    3148           0 :                 folio = __filemap_get_folio(mapping, index,
    3149             :                                           FGP_CREAT|FGP_FOR_MMAP,
    3150             :                                           vmf->gfp_mask);
    3151           0 :                 if (!folio) {
    3152           0 :                         if (fpin)
    3153             :                                 goto out_retry;
    3154           0 :                         filemap_invalidate_unlock_shared(mapping);
    3155           0 :                         return VM_FAULT_OOM;
    3156             :                 }
    3157             :         }
    3158             : 
    3159           0 :         if (!lock_folio_maybe_drop_mmap(vmf, folio, &fpin))
    3160             :                 goto out_retry;
    3161             : 
    3162             :         /* Did it get truncated? */
    3163           0 :         if (unlikely(folio->mapping != mapping)) {
    3164           0 :                 folio_unlock(folio);
    3165             :                 folio_put(folio);
    3166             :                 goto retry_find;
    3167             :         }
    3168             :         VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
    3169             : 
    3170             :         /*
    3171             :          * We have a locked page in the page cache; now we need to check
    3172             :          * that it's up-to-date. If not, it is going to be due to an error.
    3173             :          */
    3174           0 :         if (unlikely(!folio_test_uptodate(folio))) {
    3175             :                 /*
    3176             :                  * The page was in cache and uptodate and now it is not.
    3177             :                  * Strange but possible since we didn't hold the page lock all
    3178             :          * the time. Let's drop everything, get the invalidate lock and
    3179             :                  * try again.
    3180             :                  */
    3181           0 :                 if (!mapping_locked) {
    3182           0 :                         folio_unlock(folio);
    3183             :                         folio_put(folio);
    3184             :                         goto retry_find;
    3185             :                 }
    3186             :                 goto page_not_uptodate;
    3187             :         }
    3188             : 
    3189             :         /*
    3190             :          * We've made it this far and we had to drop our mmap_lock; now is the
    3191             :          * time to return to the upper layer and have it re-find the vma and
    3192             :          * redo the fault.
    3193             :          */
    3194           0 :         if (fpin) {
    3195             :                 folio_unlock(folio);
    3196             :                 goto out_retry;
    3197             :         }
    3198           0 :         if (mapping_locked)
    3199             :                 filemap_invalidate_unlock_shared(mapping);
    3200             : 
    3201             :         /*
    3202             :          * Found the page and have a reference on it.
    3203             :          * We must recheck i_size under page lock.
    3204             :          */
    3205           0 :         max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
    3206           0 :         if (unlikely(index >= max_idx)) {
    3207           0 :                 folio_unlock(folio);
    3208             :                 folio_put(folio);
    3209             :                 return VM_FAULT_SIGBUS;
    3210             :         }
    3211             : 
    3212           0 :         vmf->page = folio_file_page(folio, index);
    3213           0 :         return ret | VM_FAULT_LOCKED;
    3214             : 
    3215             : page_not_uptodate:
    3216             :         /*
    3217             :          * Umm, take care of errors if the page isn't up-to-date.
    3218             :          * Try to re-read it _once_. We do this synchronously,
    3219             :          * because there really aren't any performance issues here
    3220             :          * and we need to check for errors.
    3221             :          */
    3222           0 :         fpin = maybe_unlock_mmap_for_io(vmf, fpin);
    3223           0 :         error = filemap_read_folio(file, mapping, folio);
    3224           0 :         if (fpin)
    3225             :                 goto out_retry;
    3226           0 :         folio_put(folio);
    3227             : 
    3228           0 :         if (!error || error == AOP_TRUNCATED_PAGE)
    3229             :                 goto retry_find;
    3230           0 :         filemap_invalidate_unlock_shared(mapping);
    3231             : 
    3232           0 :         return VM_FAULT_SIGBUS;
    3233             : 
    3234             : out_retry:
    3235             :         /*
    3236             :          * We dropped the mmap_lock, so we need to return to the fault handler to
    3237             :          * re-find the vma and come back and find our hopefully still populated
    3238             :          * page.
    3239             :          */
    3240           0 :         if (folio)
    3241             :                 folio_put(folio);
    3242           0 :         if (mapping_locked)
    3243             :                 filemap_invalidate_unlock_shared(mapping);
    3244           0 :         if (fpin)
    3245           0 :                 fput(fpin);
    3246           0 :         return ret | VM_FAULT_RETRY;
    3247             : }
    3248             : EXPORT_SYMBOL(filemap_fault);
    3249             : 
    3250             : static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page)
    3251             : {
    3252           0 :         struct mm_struct *mm = vmf->vma->vm_mm;
    3253             : 
    3254             :         /* Huge page is mapped? No need to proceed. */
    3255           0 :         if (pmd_trans_huge(*vmf->pmd)) {
    3256             :                 unlock_page(page);
    3257             :                 put_page(page);
    3258             :                 return true;
    3259             :         }
    3260             : 
    3261           0 :         if (pmd_none(*vmf->pmd) && PageTransHuge(page)) {
    3262             :                 vm_fault_t ret = do_set_pmd(vmf, page);
    3263             :                 if (!ret) {
    3264             :                         /* The page is mapped successfully, reference consumed. */
    3265             :                         unlock_page(page);
    3266             :                         return true;
    3267             :                 }
    3268             :         }
    3269             : 
    3270           0 :         if (pmd_none(*vmf->pmd))
    3271           0 :                 pmd_install(mm, vmf->pmd, &vmf->prealloc_pte);
    3272             : 
    3273             :         /* See comment in handle_pte_fault() */
    3274           0 :         if (pmd_devmap_trans_unstable(vmf->pmd)) {
    3275             :                 unlock_page(page);
    3276             :                 put_page(page);
    3277             :                 return true;
    3278             :         }
    3279             : 
    3280             :         return false;
    3281             : }
    3282             : 
    3283           0 : static struct folio *next_uptodate_page(struct folio *folio,
    3284             :                                        struct address_space *mapping,
    3285             :                                        struct xa_state *xas, pgoff_t end_pgoff)
    3286             : {
    3287             :         unsigned long max_idx;
    3288             : 
    3289             :         do {
    3290           0 :                 if (!folio)
    3291             :                         return NULL;
    3292           0 :                 if (xas_retry(xas, folio))
    3293           0 :                         continue;
    3294           0 :                 if (xa_is_value(folio))
    3295           0 :                         continue;
    3296           0 :                 if (folio_test_locked(folio))
    3297           0 :                         continue;
    3298           0 :                 if (!folio_try_get_rcu(folio))
    3299             :                         continue;
    3300             :                 /* Has the page moved or been split? */
    3301           0 :                 if (unlikely(folio != xas_reload(xas)))
    3302             :                         goto skip;
    3303           0 :                 if (!folio_test_uptodate(folio) || folio_test_readahead(folio))
    3304             :                         goto skip;
    3305           0 :                 if (!folio_trylock(folio))
    3306             :                         goto skip;
    3307           0 :                 if (folio->mapping != mapping)
    3308             :                         goto unlock;
    3309           0 :                 if (!folio_test_uptodate(folio))
    3310             :                         goto unlock;
    3311           0 :                 max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
    3312           0 :                 if (xas->xa_index >= max_idx)
    3313             :                         goto unlock;
    3314             :                 return folio;
    3315             : unlock:
    3316             :                 folio_unlock(folio);
    3317             : skip:
    3318             :                 folio_put(folio);
    3319           0 :         } while ((folio = xas_next_entry(xas, end_pgoff)) != NULL);
    3320             : 
    3321             :         return NULL;
    3322             : }
    3323             : 
    3324           0 : static inline struct folio *first_map_page(struct address_space *mapping,
    3325             :                                           struct xa_state *xas,
    3326             :                                           pgoff_t end_pgoff)
    3327             : {
    3328           0 :         return next_uptodate_page(xas_find(xas, end_pgoff),
    3329             :                                   mapping, xas, end_pgoff);
    3330             : }
    3331             : 
    3332           0 : static inline struct folio *next_map_page(struct address_space *mapping,
    3333             :                                          struct xa_state *xas,
    3334             :                                          pgoff_t end_pgoff)
    3335             : {
    3336           0 :         return next_uptodate_page(xas_next_entry(xas, end_pgoff),
    3337             :                                   mapping, xas, end_pgoff);
    3338             : }
    3339             : 
    3340           0 : vm_fault_t filemap_map_pages(struct vm_fault *vmf,
    3341             :                              pgoff_t start_pgoff, pgoff_t end_pgoff)
    3342             : {
    3343           0 :         struct vm_area_struct *vma = vmf->vma;
    3344           0 :         struct file *file = vma->vm_file;
    3345           0 :         struct address_space *mapping = file->f_mapping;
    3346           0 :         pgoff_t last_pgoff = start_pgoff;
    3347             :         unsigned long addr;
    3348           0 :         XA_STATE(xas, &mapping->i_pages, start_pgoff);
    3349             :         struct folio *folio;
    3350             :         struct page *page;
    3351           0 :         unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
    3352           0 :         vm_fault_t ret = 0;
    3353             : 
    3354             :         rcu_read_lock();
    3355           0 :         folio = first_map_page(mapping, &xas, end_pgoff);
    3356           0 :         if (!folio)
    3357             :                 goto out;
    3358             : 
    3359           0 :         if (filemap_map_pmd(vmf, &folio->page)) {
    3360             :                 ret = VM_FAULT_NOPAGE;
    3361             :                 goto out;
    3362             :         }
    3363             : 
    3364           0 :         addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT);
    3365           0 :         vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
    3366             :         do {
    3367             : again:
    3368           0 :                 page = folio_file_page(folio, xas.xa_index);
    3369             :                 if (PageHWPoison(page))
    3370             :                         goto unlock;
    3371             : 
    3372           0 :                 if (mmap_miss > 0)
    3373           0 :                         mmap_miss--;
    3374             : 
    3375           0 :                 addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
    3376           0 :                 vmf->pte += xas.xa_index - last_pgoff;
    3377           0 :                 last_pgoff = xas.xa_index;
    3378             : 
    3379           0 :                 if (!pte_none(*vmf->pte))
    3380             :                         goto unlock;
    3381             : 
    3382             :                 /* We're about to handle the fault */
    3383           0 :                 if (vmf->address == addr)
    3384           0 :                         ret = VM_FAULT_NOPAGE;
    3385             : 
    3386           0 :                 do_set_pte(vmf, page, addr);
    3387             :                 /* no need to invalidate: a not-present page won't be cached */
    3388             :                 update_mmu_cache(vma, addr, vmf->pte);
    3389           0 :                 if (folio_more_pages(folio, xas.xa_index, end_pgoff)) {
    3390           0 :                         xas.xa_index++;
    3391             :                         folio_ref_inc(folio);
    3392             :                         goto again;
    3393             :                 }
    3394           0 :                 folio_unlock(folio);
    3395           0 :                 continue;
    3396             : unlock:
    3397           0 :                 if (folio_more_pages(folio, xas.xa_index, end_pgoff)) {
    3398           0 :                         xas.xa_index++;
    3399           0 :                         goto again;
    3400             :                 }
    3401           0 :                 folio_unlock(folio);
    3402             :                 folio_put(folio);
    3403           0 :         } while ((folio = next_map_page(mapping, &xas, end_pgoff)) != NULL);
    3404           0 :         pte_unmap_unlock(vmf->pte, vmf->ptl);
    3405             : out:
    3406             :         rcu_read_unlock();
    3407           0 :         WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss);
    3408           0 :         return ret;
    3409             : }
    3410             : EXPORT_SYMBOL(filemap_map_pages);
    3411             : 
    3412           0 : vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
    3413             : {
    3414           0 :         struct address_space *mapping = vmf->vma->vm_file->f_mapping;
    3415           0 :         struct folio *folio = page_folio(vmf->page);
    3416           0 :         vm_fault_t ret = VM_FAULT_LOCKED;
    3417             : 
    3418           0 :         sb_start_pagefault(mapping->host->i_sb);
    3419           0 :         file_update_time(vmf->vma->vm_file);
    3420           0 :         folio_lock(folio);
    3421           0 :         if (folio->mapping != mapping) {
    3422             :                 folio_unlock(folio);
    3423             :                 ret = VM_FAULT_NOPAGE;
    3424             :                 goto out;
    3425             :         }
    3426             :         /*
    3427             :          * We mark the folio dirty already here so that when freeze is in
    3428             :          * progress, we are guaranteed that writeback during freezing will
    3429             :          * see the dirty folio and writeprotect it again.
    3430             :          */
    3431           0 :         folio_mark_dirty(folio);
    3432           0 :         folio_wait_stable(folio);
    3433             : out:
    3434           0 :         sb_end_pagefault(mapping->host->i_sb);
    3435           0 :         return ret;
    3436             : }
    3437             : 
    3438             : const struct vm_operations_struct generic_file_vm_ops = {
    3439             :         .fault          = filemap_fault,
    3440             :         .map_pages      = filemap_map_pages,
    3441             :         .page_mkwrite   = filemap_page_mkwrite,
    3442             : };
    3443             : 
    3444             : /* This is used for a general mmap of a disk file */
    3445             : 
    3446           0 : int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
    3447             : {
    3448           0 :         struct address_space *mapping = file->f_mapping;
    3449             : 
    3450           0 :         if (!mapping->a_ops->readpage)
    3451             :                 return -ENOEXEC;
    3452           0 :         file_accessed(file);
    3453           0 :         vma->vm_ops = &generic_file_vm_ops;
    3454           0 :         return 0;
    3455             : }
    3456             : 
    3457             : /*
    3458             :  * This is for filesystems which do not implement ->writepage.
    3459             :  */
    3460           0 : int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
    3461             : {
    3462           0 :         if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
    3463             :                 return -EINVAL;
    3464             :         return generic_file_mmap(file, vma);
    3465             : }
    3466             : #else
    3467             : vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
    3468             : {
    3469             :         return VM_FAULT_SIGBUS;
    3470             : }
    3471             : int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
    3472             : {
    3473             :         return -ENOSYS;
    3474             : }
    3475             : int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
    3476             : {
    3477             :         return -ENOSYS;
    3478             : }
    3479             : #endif /* CONFIG_MMU */
    3480             : 
    3481             : EXPORT_SYMBOL(filemap_page_mkwrite);
    3482             : EXPORT_SYMBOL(generic_file_mmap);
    3483             : EXPORT_SYMBOL(generic_file_readonly_mmap);
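
To show how the pieces above fit together, here is a minimal, hypothetical
file_operations for a filesystem that relies entirely on the generic
page-cache paths (assuming the usual <linux/fs.h> context);
example_file_operations is an illustrative name, while every helper it
references is a real exported symbol. generic_file_mmap() installs
generic_file_vm_ops, so faults on such a mapping are serviced by
filemap_fault() and filemap_map_pages() above.

const struct file_operations example_file_operations = {
        .llseek         = generic_file_llseek,
        .read_iter      = generic_file_read_iter,
        .write_iter     = generic_file_write_iter,
        .mmap           = generic_file_mmap,
        .fsync          = generic_file_fsync,
        .splice_read    = generic_file_splice_read,
};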
    3484             : 
    3485           0 : static struct folio *do_read_cache_folio(struct address_space *mapping,
    3486             :                 pgoff_t index, filler_t filler, void *data, gfp_t gfp)
    3487             : {
    3488             :         struct folio *folio;
    3489             :         int err;
    3490             : repeat:
    3491           0 :         folio = filemap_get_folio(mapping, index);
    3492           0 :         if (!folio) {
    3493           0 :                 folio = filemap_alloc_folio(gfp, 0);
    3494           0 :                 if (!folio)
    3495             :                         return ERR_PTR(-ENOMEM);
    3496           0 :                 err = filemap_add_folio(mapping, folio, index, gfp);
    3497           0 :                 if (unlikely(err)) {
    3498           0 :                         folio_put(folio);
    3499           0 :                         if (err == -EEXIST)
    3500             :                                 goto repeat;
    3501             :                         /* Presumably ENOMEM for xarray node */
    3502           0 :                         return ERR_PTR(err);
    3503             :                 }
    3504             : 
    3505             : filler:
    3506           0 :                 if (filler)
    3507           0 :                         err = filler(data, &folio->page);
    3508             :                 else
    3509           0 :                         err = mapping->a_ops->readpage(data, &folio->page);
    3510             : 
    3511           0 :                 if (err < 0) {
    3512           0 :                         folio_put(folio);
    3513           0 :                         return ERR_PTR(err);
    3514             :                 }
    3515             : 
    3516           0 :                 folio_wait_locked(folio);
    3517           0 :                 if (!folio_test_uptodate(folio)) {
    3518             :                         folio_put(folio);
    3519             :                         return ERR_PTR(-EIO);
    3520             :                 }
    3521             : 
    3522             :                 goto out;
    3523             :         }
    3524           0 :         if (folio_test_uptodate(folio))
    3525             :                 goto out;
    3526             : 
    3527           0 :         if (!folio_trylock(folio)) {
    3528             :                 folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE);
    3529             :                 goto repeat;
    3530             :         }
    3531             : 
    3532             :         /* Folio was truncated from mapping */
    3533           0 :         if (!folio->mapping) {
    3534           0 :                 folio_unlock(folio);
    3535             :                 folio_put(folio);
    3536             :                 goto repeat;
    3537             :         }
    3538             : 
    3539             :         /* Someone else locked and filled the page in a very small window */
    3540           0 :         if (folio_test_uptodate(folio)) {
    3541             :                 folio_unlock(folio);
    3542             :                 goto out;
    3543             :         }
    3544             : 
    3545             :         /*
    3546             :          * A previous I/O error may have been due to temporary
    3547             :          * failures.
    3548             :          * Clear the page error before the actual read; PG_error will be
    3549             :          * set again if the read fails.
    3550             :          */
    3551             :         folio_clear_error(folio);
    3552             :         goto filler;
    3553             : 
    3554             : out:
    3555           0 :         folio_mark_accessed(folio);
    3556           0 :         return folio;
    3557             : }
    3558             : 
    3559             : /**
    3560             :  * read_cache_folio - read into page cache, fill it if needed
    3561             :  * @mapping:    the page's address_space
    3562             :  * @index:      the page index
    3563             :  * @filler:     function to perform the read
    3564             :  * @data:       first arg to filler(data, page) function, often left as NULL
    3565             :  *
    3566             :  * Read into the page cache. If a page already exists, and PageUptodate() is
    3567             :  * not set, try to fill the page and wait for it to become unlocked.
    3568             :  *
    3569             :  * If the page does not get brought uptodate, return -EIO.
    3570             :  *
    3571             :  * The function expects mapping->invalidate_lock to be already held.
    3572             :  *
    3573             :  * Return: An uptodate folio on success, ERR_PTR() on failure.
    3574             :  */
    3575           0 : struct folio *read_cache_folio(struct address_space *mapping, pgoff_t index,
    3576             :                 filler_t filler, void *data)
    3577             : {
    3578           0 :         return do_read_cache_folio(mapping, index, filler, data,
    3579             :                         mapping_gfp_mask(mapping));
    3580             : }
    3581             : EXPORT_SYMBOL(read_cache_folio);
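
A hedged usage sketch for read_cache_folio(): read the folio covering index 0
of a mapping with a NULL filler (so ->readpage is called with a NULL first
argument, which most implementations ignore), copy out the start of it, and
drop the reference. example_read_first_block is an illustrative name, and the
caller is assumed to hold mapping->invalidate_lock as the comment above
requires.

static int example_read_first_block(struct address_space *mapping,
                                    char *buf, size_t len)
{
        struct folio *folio;
        void *kaddr;

        folio = read_cache_folio(mapping, 0, NULL, NULL);
        if (IS_ERR(folio))
                return PTR_ERR(folio);

        /* Map only the first page of the folio and bound the copy to it. */
        kaddr = kmap_local_folio(folio, 0);
        memcpy(buf, kaddr, min_t(size_t, len, PAGE_SIZE));
        kunmap_local(kaddr);

        folio_put(folio);
        return 0;
}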
    3582             : 
    3583           0 : static struct page *do_read_cache_page(struct address_space *mapping,
    3584             :                 pgoff_t index, filler_t *filler, void *data, gfp_t gfp)
    3585             : {
    3586             :         struct folio *folio;
    3587             : 
    3588           0 :         folio = do_read_cache_folio(mapping, index, filler, data, gfp);
    3589           0 :         if (IS_ERR(folio))
    3590           0 :                 return &folio->page;
    3591           0 :         return folio_file_page(folio, index);
    3592             : }
    3593             : 
    3594           0 : struct page *read_cache_page(struct address_space *mapping,
    3595             :                                 pgoff_t index, filler_t *filler, void *data)
    3596             : {
    3597           0 :         return do_read_cache_page(mapping, index, filler, data,
    3598             :                         mapping_gfp_mask(mapping));
    3599             : }
    3600             : EXPORT_SYMBOL(read_cache_page);
    3601             : 
    3602             : /**
    3603             :  * read_cache_page_gfp - read into page cache, using specified page allocation flags.
    3604             :  * @mapping:    the page's address_space
    3605             :  * @index:      the page index
    3606             :  * @gfp:        the page allocator flags to use if allocating
    3607             :  *
    3608             :  * This is the same as "read_mapping_page(mapping, index, NULL)", but with
    3609             :  * any new page allocations done using the specified allocation flags.
    3610             :  *
    3611             :  * If the page does not get brought uptodate, return -EIO.
    3612             :  *
    3613             :  * The function expects mapping->invalidate_lock to be already held.
    3614             :  *
    3615             :  * Return: up to date page on success, ERR_PTR() on failure.
    3616             :  */
    3617           0 : struct page *read_cache_page_gfp(struct address_space *mapping,
    3618             :                                 pgoff_t index,
    3619             :                                 gfp_t gfp)
    3620             : {
    3621           0 :         return do_read_cache_page(mapping, index, NULL, NULL, gfp);
    3622             : }
    3623             : EXPORT_SYMBOL(read_cache_page_gfp);
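
A similar hedged sketch for read_cache_page_gfp(): a filesystem reading one of
its own metadata pages with GFP_NOFS so that the allocation cannot recurse back
into the filesystem. example_check_magic and the magic value are purely
illustrative.

static int example_check_magic(struct address_space *mapping, pgoff_t index)
{
        struct page *page;
        __le32 *p;
        int ret = 0;

        page = read_cache_page_gfp(mapping, index, GFP_NOFS);
        if (IS_ERR(page))
                return PTR_ERR(page);

        p = kmap_local_page(page);
        if (le32_to_cpup(p) != 0x6d617069)      /* hypothetical on-disk magic */
                ret = -EUCLEAN;
        kunmap_local(p);

        put_page(page);
        return ret;
}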
    3624             : 
    3625           0 : int pagecache_write_begin(struct file *file, struct address_space *mapping,
    3626             :                                 loff_t pos, unsigned len, unsigned flags,
    3627             :                                 struct page **pagep, void **fsdata)
    3628             : {
    3629           0 :         const struct address_space_operations *aops = mapping->a_ops;
    3630             : 
    3631           0 :         return aops->write_begin(file, mapping, pos, len, flags,
    3632             :                                                         pagep, fsdata);
    3633             : }
    3634             : EXPORT_SYMBOL(pagecache_write_begin);
    3635             : 
    3636           0 : int pagecache_write_end(struct file *file, struct address_space *mapping,
    3637             :                                 loff_t pos, unsigned len, unsigned copied,
    3638             :                                 struct page *page, void *fsdata)
    3639             : {
    3640           0 :         const struct address_space_operations *aops = mapping->a_ops;
    3641             : 
    3642           0 :         return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
    3643             : }
    3644             : EXPORT_SYMBOL(pagecache_write_end);
    3645             : 
    3646             : /*
    3647             :  * Warn about a page cache invalidation failure during a direct I/O write.
    3648             :  */
    3649           0 : void dio_warn_stale_pagecache(struct file *filp)
    3650             : {
    3651             :         static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
    3652             :         char pathname[128];
    3653             :         char *path;
    3654             : 
    3655           0 :         errseq_set(&filp->f_mapping->wb_err, -EIO);
    3656           0 :         if (__ratelimit(&_rs)) {
    3657           0 :                 path = file_path(filp, pathname, sizeof(pathname));
    3658           0 :                 if (IS_ERR(path))
    3659           0 :                         path = "(unknown)";
    3660           0 :                 pr_crit("Page cache invalidation failure on direct I/O.  Possible data corruption due to collision with buffered I/O!\n");
    3661           0 :                 pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid,
    3662             :                         current->comm);
    3663             :         }
    3664           0 : }
    3665             : 
    3666             : ssize_t
    3667           0 : generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
    3668             : {
    3669           0 :         struct file     *file = iocb->ki_filp;
    3670           0 :         struct address_space *mapping = file->f_mapping;
    3671           0 :         struct inode    *inode = mapping->host;
    3672           0 :         loff_t          pos = iocb->ki_pos;
    3673             :         ssize_t         written;
    3674             :         size_t          write_len;
    3675             :         pgoff_t         end;
    3676             : 
    3677           0 :         write_len = iov_iter_count(from);
    3678           0 :         end = (pos + write_len - 1) >> PAGE_SHIFT;
    3679             : 
    3680           0 :         if (iocb->ki_flags & IOCB_NOWAIT) {
    3681             :                 /* If there are pages to writeback, return */
    3682           0 :                 if (filemap_range_has_page(file->f_mapping, pos,
    3683             :                                            pos + write_len - 1))
    3684             :                         return -EAGAIN;
    3685             :         } else {
    3686           0 :                 written = filemap_write_and_wait_range(mapping, pos,
    3687             :                                                         pos + write_len - 1);
    3688           0 :                 if (written)
    3689             :                         goto out;
    3690             :         }
    3691             : 
    3692             :         /*
    3693             :          * After a write we want buffered reads to be sure to go to disk to get
    3694             :          * the new data.  We invalidate clean cached pages from the region we're
    3695             :          * about to write.  We do this *before* the write so that we can return
    3696             :          * without clobbering -EIOCBQUEUED from ->direct_IO().
    3697             :          */
    3698           0 :         written = invalidate_inode_pages2_range(mapping,
    3699           0 :                                         pos >> PAGE_SHIFT, end);
    3700             :         /*
    3701             :          * If a page cannot be invalidated, return 0 to fall back
    3702             :          * to buffered write.
    3703             :          */
    3704           0 :         if (written) {
    3705           0 :                 if (written == -EBUSY)
    3706             :                         return 0;
    3707             :                 goto out;
    3708             :         }
    3709             : 
    3710           0 :         written = mapping->a_ops->direct_IO(iocb, from);
    3711             : 
    3712             :         /*
    3713             :          * Finally, try again to invalidate clean pages which might have been
    3714             :          * cached by non-direct readahead, or faulted in by get_user_pages()
    3715             :          * if the source of the write was an mmap'ed region of the file
    3716             :          * we're writing.  Either one is a pretty crazy thing to do,
    3717             :          * so we don't support it 100%.  If this invalidation
    3718             :          * fails, tough, the write still worked...
    3719             :          *
    3720             :          * Most of the time we do not need this since dio_complete() will do
    3721             :          * the invalidation for us. However there are some file systems that
    3722             :          * do not end up with dio_complete() being called, so let's not break
    3723             :          * them by removing it completely.
    3724             :          *
    3725             :          * A notable example is blkdev_direct_IO().
    3726             :          *
    3727             :          * Skip invalidation for async writes or if mapping has no pages.
    3728             :          */
    3729           0 :         if (written > 0 && mapping->nrpages &&
    3730           0 :             invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end))
    3731           0 :                 dio_warn_stale_pagecache(file);
    3732             : 
    3733           0 :         if (written > 0) {
    3734           0 :                 pos += written;
    3735           0 :                 write_len -= written;
    3736           0 :                 if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
    3737           0 :                         i_size_write(inode, pos);
    3738             :                         mark_inode_dirty(inode);
    3739             :                 }
    3740           0 :                 iocb->ki_pos = pos;
    3741             :         }
    3742           0 :         if (written != -EIOCBQUEUED)
    3743           0 :                 iov_iter_revert(from, write_len - iov_iter_count(from));
    3744             : out:
    3745             :         return written;
    3746             : }
    3747             : EXPORT_SYMBOL(generic_file_direct_write);
    3748             : 
    3749           0 : ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
    3750             : {
    3751           0 :         struct file *file = iocb->ki_filp;
    3752           0 :         loff_t pos = iocb->ki_pos;
    3753           0 :         struct address_space *mapping = file->f_mapping;
    3754           0 :         const struct address_space_operations *a_ops = mapping->a_ops;
    3755           0 :         long status = 0;
    3756           0 :         ssize_t written = 0;
    3757           0 :         unsigned int flags = 0;
    3758             : 
    3759             :         do {
    3760             :                 struct page *page;
    3761             :                 unsigned long offset;   /* Offset into pagecache page */
    3762             :                 unsigned long bytes;    /* Bytes to write to page */
    3763             :                 size_t copied;          /* Bytes copied from user */
    3764             :                 void *fsdata;
    3765             : 
    3766           0 :                 offset = (pos & (PAGE_SIZE - 1));
    3767           0 :                 bytes = min_t(unsigned long, PAGE_SIZE - offset,
    3768             :                                                 iov_iter_count(i));
    3769             : 
    3770             : again:
    3771             :                 /*
    3772             :                  * Bring in the user page that we will copy from _first_.
    3773             :                  * Otherwise there's a nasty deadlock on copying from the
    3774             :                  * same page as we're writing to, without it being marked
    3775             :                  * up-to-date.
    3776             :                  */
    3777           0 :                 if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
    3778             :                         status = -EFAULT;
    3779           0 :                         break;
    3780             :                 }
    3781             : 
    3782           0 :                 if (fatal_signal_pending(current)) {
    3783             :                         status = -EINTR;
    3784             :                         break;
    3785             :                 }
    3786             : 
    3787           0 :                 status = a_ops->write_begin(file, mapping, pos, bytes, flags,
    3788             :                                                 &page, &fsdata);
    3789           0 :                 if (unlikely(status < 0))
    3790             :                         break;
    3791             : 
    3792           0 :                 if (mapping_writably_mapped(mapping))
    3793             :                         flush_dcache_page(page);
    3794             : 
    3795           0 :                 copied = copy_page_from_iter_atomic(page, offset, bytes, i);
    3796           0 :                 flush_dcache_page(page);
    3797             : 
    3798           0 :                 status = a_ops->write_end(file, mapping, pos, bytes, copied,
    3799             :                                                 page, fsdata);
    3800           0 :                 if (unlikely(status != copied)) {
    3801           0 :                         iov_iter_revert(i, copied - max(status, 0L));
    3802           0 :                         if (unlikely(status < 0))
    3803             :                                 break;
    3804             :                 }
    3805           0 :                 cond_resched();
    3806             : 
    3807           0 :                 if (unlikely(status == 0)) {
    3808             :                         /*
    3809             :                          * A short copy made ->write_end() reject the
    3810             :                          * thing entirely.  Might be memory poisoning
    3811             :                          * halfway through, might be a race with munmap,
    3812             :                          * might be severe memory pressure.
    3813             :                          */
    3814           0 :                         if (copied)
    3815           0 :                                 bytes = copied;
    3816             :                         goto again;
    3817             :                 }
    3818           0 :                 pos += status;
    3819           0 :                 written += status;
    3820             : 
    3821           0 :                 balance_dirty_pages_ratelimited(mapping);
    3822           0 :         } while (iov_iter_count(i));
    3823             : 
    3824           0 :         return written ? written : status;
    3825             : }
    3826             : EXPORT_SYMBOL(generic_perform_write);
    3827             : 
    3828             : /**
    3829             :  * __generic_file_write_iter - write data to a file
    3830             :  * @iocb:       IO state structure (file, offset, etc.)
    3831             :  * @from:       iov_iter with data to write
    3832             :  *
    3833             :  * This function does all the work needed for actually writing data to a
    3834             :  * file. It does all basic checks, removes SUID from the file, updates
    3835             :  * modification times and calls proper subroutines depending on whether we
    3836             :  * do direct IO or a standard buffered write.
    3837             :  *
    3838             :  * It expects i_rwsem to be grabbed unless we work on a block device or similar
    3839             :  * object which does not need locking at all.
    3840             :  *
    3841             :  * This function does *not* take care of syncing data in case of O_SYNC write.
    3842             :  * A caller has to handle it. This is mainly due to the fact that we want to
    3843             :  * avoid syncing under i_rwsem.
    3844             :  *
    3845             :  * Return:
    3846             :  * * number of bytes written, even for truncated writes
    3847             :  * * negative error code if no data has been written at all
    3848             :  */
    3849           0 : ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
    3850             : {
    3851           0 :         struct file *file = iocb->ki_filp;
    3852           0 :         struct address_space *mapping = file->f_mapping;
    3853           0 :         struct inode    *inode = mapping->host;
    3854           0 :         ssize_t         written = 0;
    3855             :         ssize_t         err;
    3856             :         ssize_t         status;
    3857             : 
    3858             :         /* We can write back this queue in page reclaim */
    3859           0 :         current->backing_dev_info = inode_to_bdi(inode);
    3860           0 :         err = file_remove_privs(file);
    3861           0 :         if (err)
    3862             :                 goto out;
    3863             : 
    3864           0 :         err = file_update_time(file);
    3865           0 :         if (err)
    3866             :                 goto out;
    3867             : 
    3868           0 :         if (iocb->ki_flags & IOCB_DIRECT) {
    3869             :                 loff_t pos, endbyte;
    3870             : 
    3871           0 :                 written = generic_file_direct_write(iocb, from);
    3872             :                 /*
    3873             :                  * If the write stopped short of completing, fall back to
    3874             :                  * buffered writes.  Some filesystems do this for writes to
    3875             :                  * holes, for example.  For DAX files, a buffered write will
    3876             :                  * not succeed (even if it did, DAX does not handle dirty
    3877             :                  * page-cache pages correctly).
    3878             :                  */
    3879           0 :                 if (written < 0 || !iov_iter_count(from) || IS_DAX(inode))
    3880             :                         goto out;
    3881             : 
    3882           0 :                 pos = iocb->ki_pos;
    3883           0 :                 status = generic_perform_write(iocb, from);
    3884             :                 /*
    3885             :                  * If generic_perform_write() returned a synchronous error
    3886             :                  * then we want to return the number of bytes which were
    3887             :                  * direct-written, or the error code if that was zero.  Note
    3888             :                  * that this differs from normal direct-io semantics, which
    3889             :                  * will return -EFOO even if some bytes were written.
    3890             :                  */
    3891           0 :                 if (unlikely(status < 0)) {
    3892             :                         err = status;
    3893             :                         goto out;
    3894             :                 }
    3895             :                 /*
    3896             :                  * We need to ensure that the page cache pages are written to
    3897             :                  * disk and invalidated to preserve the expected O_DIRECT
    3898             :                  * semantics.
    3899             :                  */
    3900           0 :                 endbyte = pos + status - 1;
    3901           0 :                 err = filemap_write_and_wait_range(mapping, pos, endbyte);
    3902           0 :                 if (err == 0) {
    3903           0 :                         iocb->ki_pos = endbyte + 1;
    3904           0 :                         written += status;
    3905           0 :                         invalidate_mapping_pages(mapping,
    3906           0 :                                                  pos >> PAGE_SHIFT,
    3907           0 :                                                  endbyte >> PAGE_SHIFT);
    3908             :                 } else {
    3909             :                         /*
    3910             :                          * We don't know how much we wrote, so just return
    3911             :                          * the number of bytes which were direct-written
    3912             :                          */
    3913             :                 }
    3914             :         } else {
    3915           0 :                 written = generic_perform_write(iocb, from);
    3916           0 :                 if (likely(written > 0))
    3917           0 :                         iocb->ki_pos += written;
    3918             :         }
    3919             : out:
    3920           0 :         current->backing_dev_info = NULL;
    3921           0 :         return written ? written : err;
    3922             : }
    3923             : EXPORT_SYMBOL(__generic_file_write_iter);
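/*
 * Illustrative sketch, not part of filemap.c: the kernel-doc above notes that
 * i_rwsem is expected to be held unless the caller is a block device or a
 * similar object that needs no locking, and that O_SYNC handling is left to
 * the caller.  A caller of that kind might look roughly like the following.
 * "myblk_write_iter" is a hypothetical name used only for this example; the
 * real block-device write path lives in block/fops.c and differs in detail.
 */
static ssize_t myblk_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	ssize_t ret;

	/* No inode_lock() here: a block-device-like object does not need it. */
	ret = generic_write_checks(iocb, from);
	if (ret <= 0)
		return ret;

	ret = __generic_file_write_iter(iocb, from);

	/*
	 * __generic_file_write_iter() deliberately does not sync; honour
	 * O_SYNC/O_DSYNC here, after the write has been submitted.
	 */
	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}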
    3924             : 
    3925             : /**
    3926             :  * generic_file_write_iter - write data to a file
    3927             :  * @iocb:       IO state structure
    3928             :  * @from:       iov_iter with data to write
    3929             :  *
    3930             :  * This is a wrapper around __generic_file_write_iter() to be used by most
    3931             :  * filesystems. It takes care of syncing the file in case of O_SYNC file
    3932             :  * and acquires i_rwsem as needed.
    3933             :  * Return:
     3934             :  * * negative error code if no data has been written at all or
    3935             :  *   vfs_fsync_range() failed for a synchronous write
    3936             :  * * number of bytes written, even for truncated writes
    3937             :  */
    3938           0 : ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
    3939             : {
    3940           0 :         struct file *file = iocb->ki_filp;
    3941           0 :         struct inode *inode = file->f_mapping->host;
    3942             :         ssize_t ret;
    3943             : 
    3944           0 :         inode_lock(inode);
    3945           0 :         ret = generic_write_checks(iocb, from);
    3946           0 :         if (ret > 0)
    3947           0 :                 ret = __generic_file_write_iter(iocb, from);
    3948           0 :         inode_unlock(inode);
    3949             : 
    3950           0 :         if (ret > 0)
    3951           0 :                 ret = generic_write_sync(iocb, ret);
    3952           0 :         return ret;
    3953             : }
    3954             : EXPORT_SYMBOL(generic_file_write_iter);
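/*
 * Illustrative sketch, not part of filemap.c: a filesystem with no special
 * write-path requirements can use generic_file_write_iter() directly as its
 * ->write_iter method; it supplies the i_rwsem locking and O_SYNC handling
 * described above.  "myfs_file_operations" is a hypothetical name for this
 * example; simple in-kernel filesystems such as ramfs wire the helper up in
 * essentially this way.
 */
static const struct file_operations myfs_file_operations = {
	.read_iter	= generic_file_read_iter,
	.write_iter	= generic_file_write_iter,
	.mmap		= generic_file_mmap,
	.fsync		= noop_fsync,
	.llseek		= generic_file_llseek,
};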
    3955             : 
    3956             : /**
    3957             :  * filemap_release_folio() - Release fs-specific metadata on a folio.
    3958             :  * @folio: The folio which the kernel is trying to free.
    3959             :  * @gfp: Memory allocation flags (and I/O mode).
    3960             :  *
    3961             :  * The address_space is trying to release any data attached to a folio
    3962             :  * (presumably at folio->private).
    3963             :  *
     3964             :  * This may also be called if the private_2 flag is set on a page,
    3965             :  * indicating that the folio has other metadata associated with it.
    3966             :  *
    3967             :  * The @gfp argument specifies whether I/O may be performed to release
    3968             :  * this page (__GFP_IO), and whether the call may block
    3969             :  * (__GFP_RECLAIM & __GFP_FS).
    3970             :  *
    3971             :  * Return: %true if the release was successful, otherwise %false.
    3972             :  */
    3973           0 : bool filemap_release_folio(struct folio *folio, gfp_t gfp)
    3974             : {
    3975           0 :         struct address_space * const mapping = folio->mapping;
    3976             : 
    3977           0 :         BUG_ON(!folio_test_locked(folio));
    3978           0 :         if (folio_test_writeback(folio))
    3979             :                 return false;
    3980             : 
    3981           0 :         if (mapping && mapping->a_ops->releasepage)
    3982           0 :                 return mapping->a_ops->releasepage(&folio->page, gfp);
    3983           0 :         return try_to_free_buffers(&folio->page);
    3984             : }
    3985             : EXPORT_SYMBOL(filemap_release_folio);
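/*
 * Illustrative sketch, not part of filemap.c: the kind of ->releasepage
 * method that filemap_release_folio() ends up calling.  It must free any
 * metadata hung off the page's ->private and return 1 on success, or
 * return 0 to tell the VM the page cannot be freed yet.  The @gfp argument
 * tells it whether I/O may be performed, as described in the kernel-doc
 * above.  "myfs_releasepage" and "struct myfs_private" are hypothetical
 * names used only for this example.
 */
struct myfs_private {			/* hypothetical per-page metadata */
	bool needs_writeout;
};

static int myfs_releasepage(struct page *page, gfp_t gfp)
{
	struct myfs_private *priv;

	/* Nothing attached: trivially releasable. */
	if (!page_has_private(page))
		return 1;

	priv = (struct myfs_private *)page_private(page);

	/*
	 * Example policy: if releasing would require I/O but the caller did
	 * not allow it (__GFP_IO clear in @gfp), refuse; the VM will retry.
	 */
	if (priv->needs_writeout && !(gfp & __GFP_IO))
		return 0;

	detach_page_private(page);
	kfree(priv);
	return 1;
}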

Generated by: LCOV version 1.14