LCOV - code coverage report
Current view: top level - mm - swap_state.c (source / functions)
Test: coverage.info                     Date: 2022-12-09 01:23:36
Coverage:   Lines: 5 / 325 (1.5 %)      Functions: 1 / 24 (4.2 %)

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0
       2             : /*
       3             :  *  linux/mm/swap_state.c
       4             :  *
       5             :  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
       6             :  *  Swap reorganised 29.12.95, Stephen Tweedie
       7             :  *
       8             :  *  Rewritten to use page cache, (C) 1998 Stephen Tweedie
       9             :  */
      10             : #include <linux/mm.h>
      11             : #include <linux/gfp.h>
      12             : #include <linux/kernel_stat.h>
      13             : #include <linux/swap.h>
      14             : #include <linux/swapops.h>
      15             : #include <linux/init.h>
      16             : #include <linux/pagemap.h>
      17             : #include <linux/backing-dev.h>
      18             : #include <linux/blkdev.h>
      19             : #include <linux/pagevec.h>
      20             : #include <linux/migrate.h>
      21             : #include <linux/vmalloc.h>
      22             : #include <linux/swap_slots.h>
      23             : #include <linux/huge_mm.h>
      24             : #include <linux/shmem_fs.h>
      25             : #include "internal.h"
      26             : 
      27             : /*
      28             :  * swapper_space is a fiction, retained to simplify the path through
      29             :  * vmscan's shrink_page_list.
      30             :  */
      31             : static const struct address_space_operations swap_aops = {
      32             :         .writepage      = swap_writepage,
      33             :         .dirty_folio    = swap_dirty_folio,
      34             : #ifdef CONFIG_MIGRATION
      35             :         .migratepage    = migrate_page,
      36             : #endif
      37             : };
      38             : 
      39             : struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly;
      40             : static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly;
      41             : static bool enable_vma_readahead __read_mostly = true;
      42             : 
      43             : #define SWAP_RA_WIN_SHIFT       (PAGE_SHIFT / 2)
      44             : #define SWAP_RA_HITS_MASK       ((1UL << SWAP_RA_WIN_SHIFT) - 1)
      45             : #define SWAP_RA_HITS_MAX        SWAP_RA_HITS_MASK
      46             : #define SWAP_RA_WIN_MASK        (~PAGE_MASK & ~SWAP_RA_HITS_MASK)
      47             : 
      48             : #define SWAP_RA_HITS(v)         ((v) & SWAP_RA_HITS_MASK)
      49             : #define SWAP_RA_WIN(v)          (((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
      50             : #define SWAP_RA_ADDR(v)         ((v) & PAGE_MASK)
      51             : 
      52             : #define SWAP_RA_VAL(addr, win, hits)                            \
      53             :         (((addr) & PAGE_MASK) |                                     \
      54             :          (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) |  \
      55             :          ((hits) & SWAP_RA_HITS_MASK))
      56             : 
       57             : /* The initial readahead hit count is 4, to start up with a small window */
      58             : #define GET_SWAP_RA_VAL(vma)                                    \
      59             :         (atomic_long_read(&(vma)->swap_readahead_info) ? : 4)
      60             : 
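The SWAP_RA_* macros above pack the last faulting address, the readahead window and the hit count into the single long stored in vma->swap_readahead_info: the hits occupy the low PAGE_SHIFT/2 bits, the window the next PAGE_SHIFT/2 bits, and the page-aligned address everything above PAGE_MASK. Below is a minimal user-space sketch of that packing, assuming a 4 KiB page (PAGE_SHIFT == 12), so the derived mask values are illustrative only:

#include <stdio.h>

#define PAGE_SHIFT        12                                  /* assumed 4 KiB pages */
#define PAGE_SIZE         (1UL << PAGE_SHIFT)
#define PAGE_MASK         (~(PAGE_SIZE - 1))

#define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2)                    /* 6 */
#define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1)    /* 0x3f */
#define SWAP_RA_WIN_MASK  (~PAGE_MASK & ~SWAP_RA_HITS_MASK)   /* 0xfc0 */

/* pack address, window and hits the same way SWAP_RA_VAL() does */
static unsigned long swap_ra_val(unsigned long addr, unsigned long win,
                                 unsigned long hits)
{
        return (addr & PAGE_MASK) |
               ((win << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) |
               (hits & SWAP_RA_HITS_MASK);
}

int main(void)
{
        unsigned long v = swap_ra_val(0x7f1234567123UL, 8, 3);

        printf("addr=%#lx win=%lu hits=%lu\n",
               v & PAGE_MASK,                                  /* SWAP_RA_ADDR() */
               (v & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT,    /* SWAP_RA_WIN()  */
               v & SWAP_RA_HITS_MASK);                         /* SWAP_RA_HITS() */
        return 0;
}

Compiled with a plain cc, this prints "addr=0x7f1234567000 win=8 hits=3".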
      61             : #define INC_CACHE_INFO(x)       data_race(swap_cache_info.x++)
      62             : #define ADD_CACHE_INFO(x, nr)   data_race(swap_cache_info.x += (nr))
      63             : 
      64             : static struct {
      65             :         unsigned long add_total;
      66             :         unsigned long del_total;
      67             :         unsigned long find_success;
      68             :         unsigned long find_total;
      69             : } swap_cache_info;
      70             : 
      71             : static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);
      72             : 
      73           0 : void show_swap_cache_info(void)
      74             : {
      75           0 :         printk("%lu pages in swap cache\n", total_swapcache_pages());
      76           0 :         printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
      77             :                 swap_cache_info.add_total, swap_cache_info.del_total,
      78             :                 swap_cache_info.find_success, swap_cache_info.find_total);
      79           0 :         printk("Free swap  = %ldkB\n",
      80             :                 get_nr_swap_pages() << (PAGE_SHIFT - 10));
      81           0 :         printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
      82           0 : }
      83             : 
      84           0 : void *get_shadow_from_swap_cache(swp_entry_t entry)
      85             : {
      86           0 :         struct address_space *address_space = swap_address_space(entry);
      87           0 :         pgoff_t idx = swp_offset(entry);
      88             :         struct page *page;
      89             : 
      90           0 :         page = xa_load(&address_space->i_pages, idx);
      91           0 :         if (xa_is_value(page))
      92             :                 return page;
      93           0 :         return NULL;
      94             : }
      95             : 
      96             : /*
      97             :  * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
       98             :  * but sets the SwapCache flag and page private instead of mapping and index.
      99             :  */
     100           0 : int add_to_swap_cache(struct page *page, swp_entry_t entry,
     101             :                         gfp_t gfp, void **shadowp)
     102             : {
     103           0 :         struct address_space *address_space = swap_address_space(entry);
     104           0 :         pgoff_t idx = swp_offset(entry);
     105           0 :         XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page));
     106           0 :         unsigned long i, nr = thp_nr_pages(page);
     107             :         void *old;
     108             : 
     109             :         VM_BUG_ON_PAGE(!PageLocked(page), page);
     110             :         VM_BUG_ON_PAGE(PageSwapCache(page), page);
     111             :         VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
     112             : 
     113           0 :         page_ref_add(page, nr);
     114             :         SetPageSwapCache(page);
     115             : 
     116             :         do {
     117           0 :                 xas_lock_irq(&xas);
     118           0 :                 xas_create_range(&xas);
     119           0 :                 if (xas_error(&xas))
     120             :                         goto unlock;
     121           0 :                 for (i = 0; i < nr; i++) {
     122             :                         VM_BUG_ON_PAGE(xas.xa_index != idx + i, page);
     123           0 :                         old = xas_load(&xas);
     124           0 :                         if (xa_is_value(old)) {
     125           0 :                                 if (shadowp)
     126           0 :                                         *shadowp = old;
     127             :                         }
     128           0 :                         set_page_private(page + i, entry.val + i);
     129           0 :                         xas_store(&xas, page);
     130           0 :                         xas_next(&xas);
     131             :                 }
     132           0 :                 address_space->nrpages += nr;
     133           0 :                 __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
     134           0 :                 __mod_lruvec_page_state(page, NR_SWAPCACHE, nr);
     135           0 :                 ADD_CACHE_INFO(add_total, nr);
     136             : unlock:
     137           0 :                 xas_unlock_irq(&xas);
     138           0 :         } while (xas_nomem(&xas, gfp));
     139             : 
     140           0 :         if (!xas_error(&xas))
     141             :                 return 0;
     142             : 
     143           0 :         ClearPageSwapCache(page);
     144           0 :         page_ref_sub(page, nr);
     145           0 :         return xas_error(&xas);
     146             : }
     147             : 
     148             : /*
     149             :  * This must be called only on pages that have
     150             :  * been verified to be in the swap cache.
     151             :  */
     152           0 : void __delete_from_swap_cache(struct page *page,
     153             :                         swp_entry_t entry, void *shadow)
     154             : {
     155           0 :         struct address_space *address_space = swap_address_space(entry);
     156           0 :         int i, nr = thp_nr_pages(page);
     157           0 :         pgoff_t idx = swp_offset(entry);
     158           0 :         XA_STATE(xas, &address_space->i_pages, idx);
     159             : 
     160             :         VM_BUG_ON_PAGE(!PageLocked(page), page);
     161             :         VM_BUG_ON_PAGE(!PageSwapCache(page), page);
     162             :         VM_BUG_ON_PAGE(PageWriteback(page), page);
     163             : 
     164           0 :         for (i = 0; i < nr; i++) {
     165           0 :                 void *entry = xas_store(&xas, shadow);
     166             :                 VM_BUG_ON_PAGE(entry != page, entry);
     167           0 :                 set_page_private(page + i, 0);
     168           0 :                 xas_next(&xas);
     169             :         }
     170           0 :         ClearPageSwapCache(page);
     171           0 :         address_space->nrpages -= nr;
     172           0 :         __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
     173           0 :         __mod_lruvec_page_state(page, NR_SWAPCACHE, -nr);
     174           0 :         ADD_CACHE_INFO(del_total, nr);
     175           0 : }
     176             : 
     177             : /**
     178             :  * add_to_swap - allocate swap space for a page
     179             :  * @page: page we want to move to swap
     180             :  *
     181             :  * Allocate swap space for the page and add the page to the
     182             :  * swap cache.  Caller needs to hold the page lock. 
     183             :  */
     184           0 : int add_to_swap(struct page *page)
     185             : {
     186             :         swp_entry_t entry;
     187             :         int err;
     188             : 
     189             :         VM_BUG_ON_PAGE(!PageLocked(page), page);
     190             :         VM_BUG_ON_PAGE(!PageUptodate(page), page);
     191             : 
     192           0 :         entry = get_swap_page(page);
     193           0 :         if (!entry.val)
     194             :                 return 0;
     195             : 
     196             :         /*
     197             :          * XArray node allocations from PF_MEMALLOC contexts could
     198             :          * completely exhaust the page allocator. __GFP_NOMEMALLOC
     199             :          * stops emergency reserves from being allocated.
     200             :          *
     201             :          * TODO: this could cause a theoretical memory reclaim
     202             :          * deadlock in the swap out path.
     203             :          */
     204             :         /*
     205             :          * Add it to the swap cache.
     206             :          */
     207           0 :         err = add_to_swap_cache(page, entry,
     208             :                         __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL);
     209           0 :         if (err)
     210             :                 /*
     211             :                  * add_to_swap_cache() doesn't return -EEXIST, so we can safely
     212             :                  * clear SWAP_HAS_CACHE flag.
     213             :                  */
     214             :                 goto fail;
     215             :         /*
      216             :          * Normally the page will be dirtied in unmap because its pte should be
      217             :          * dirty. A special case is an MADV_FREE page: its pte may have the
      218             :          * dirty bit cleared while the page's SwapBacked bit is still set,
      219             :          * because the dirty bit and the SwapBacked bit are cleared without
      220             :          * lock protection. For such a page, unmap will not set the dirty bit,
      221             :          * so page reclaim will not write the page out. This can cause data
      222             :          * corruption when the page is swapped in later. Always setting the
      223             :          * dirty bit for the page solves the problem.
     224             :          */
     225           0 :         set_page_dirty(page);
     226             : 
     227           0 :         return 1;
     228             : 
     229             : fail:
     230           0 :         put_swap_page(page, entry);
     231           0 :         return 0;
     232             : }
     233             : 
     234             : /*
     235             :  * This must be called only on pages that have
     236             :  * been verified to be in the swap cache and locked.
      237             :  * It will never put the page into the free list, because
      238             :  * the caller holds a reference on the page.
     239             :  */
     240           0 : void delete_from_swap_cache(struct page *page)
     241             : {
     242           0 :         swp_entry_t entry = { .val = page_private(page) };
     243           0 :         struct address_space *address_space = swap_address_space(entry);
     244             : 
     245           0 :         xa_lock_irq(&address_space->i_pages);
     246           0 :         __delete_from_swap_cache(page, entry, NULL);
     247           0 :         xa_unlock_irq(&address_space->i_pages);
     248             : 
     249           0 :         put_swap_page(page, entry);
     250           0 :         page_ref_sub(page, thp_nr_pages(page));
     251           0 : }
     252             : 
     253           0 : void clear_shadow_from_swap_cache(int type, unsigned long begin,
     254             :                                 unsigned long end)
     255             : {
     256           0 :         unsigned long curr = begin;
     257             :         void *old;
     258             : 
     259           0 :         for (;;) {
     260           0 :                 swp_entry_t entry = swp_entry(type, curr);
     261           0 :                 struct address_space *address_space = swap_address_space(entry);
     262           0 :                 XA_STATE(xas, &address_space->i_pages, curr);
     263             : 
     264           0 :                 xa_lock_irq(&address_space->i_pages);
     265           0 :                 xas_for_each(&xas, old, end) {
     266           0 :                         if (!xa_is_value(old))
     267           0 :                                 continue;
     268           0 :                         xas_store(&xas, NULL);
     269             :                 }
     270           0 :                 xa_unlock_irq(&address_space->i_pages);
     271             : 
      272             :                 /* step to the next swap address space chunk until we pass end */
     273           0 :                 curr >>= SWAP_ADDRESS_SPACE_SHIFT;
     274           0 :                 curr++;
     275           0 :                 curr <<= SWAP_ADDRESS_SPACE_SHIFT;
     276           0 :                 if (curr > end)
     277             :                         break;
     278             :         }
     279           0 : }
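clear_shadow_from_swap_cache() walks [begin, end] one swap address space at a time; after clearing the shadows reachable through one address_space it advances curr to the first offset of the next chunk with the shift/increment/shift sequence. A small stand-alone sketch of just that arithmetic, assuming SWAP_ADDRESS_SPACE_SHIFT == 14 (i.e. 2^14 slots per swap address space, the value current kernels use):

#include <stdio.h>

#define SWAP_ADDRESS_SPACE_SHIFT 14     /* assumed: 1 << 14 slots per address_space */

/* round a swap offset up to the first offset of the next chunk */
static unsigned long next_chunk_start(unsigned long curr)
{
        curr >>= SWAP_ADDRESS_SPACE_SHIFT;      /* which chunk holds curr   */
        curr++;                                 /* step to the next chunk   */
        curr <<= SWAP_ADDRESS_SPACE_SHIFT;      /* its first slot offset    */
        return curr;
}

int main(void)
{
        /* offset 20000 lies in chunk 1 (slots 16384..32767), so the next chunk starts at 32768 */
        printf("%lu -> %lu\n", 20000UL, next_chunk_start(20000UL));
        return 0;
}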
     280             : 
     281             : /* 
     282             :  * If we are the only user, then try to free up the swap cache. 
     283             :  * 
      284             :  * It's ok to check for PageSwapCache without the page lock
     285             :  * here because we are going to recheck again inside
     286             :  * try_to_free_swap() _with_ the lock.
     287             :  *                                      - Marcelo
     288             :  */
     289           0 : void free_swap_cache(struct page *page)
     290             : {
     291           0 :         if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) {
     292           0 :                 try_to_free_swap(page);
     293           0 :                 unlock_page(page);
     294             :         }
     295           0 : }
     296             : 
     297             : /* 
     298             :  * Perform a free_page(), also freeing any swap cache associated with
     299             :  * this page if it is the last user of the page.
     300             :  */
     301           0 : void free_page_and_swap_cache(struct page *page)
     302             : {
     303           0 :         free_swap_cache(page);
     304           0 :         if (!is_huge_zero_page(page))
     305           0 :                 put_page(page);
     306           0 : }
     307             : 
     308             : /*
     309             :  * Passed an array of pages, drop them all from swapcache and then release
     310             :  * them.  They are removed from the LRU and freed if this is their last use.
     311             :  */
     312           0 : void free_pages_and_swap_cache(struct page **pages, int nr)
     313             : {
     314           0 :         struct page **pagep = pages;
     315             :         int i;
     316             : 
     317           0 :         lru_add_drain();
     318           0 :         for (i = 0; i < nr; i++)
     319           0 :                 free_swap_cache(pagep[i]);
     320           0 :         release_pages(pagep, nr);
     321           0 : }
     322             : 
     323             : static inline bool swap_use_vma_readahead(void)
     324             : {
     325           0 :         return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap);
     326             : }
     327             : 
     328             : /*
     329             :  * Lookup a swap entry in the swap cache. A found page will be returned
      330             :  * unlocked and with its refcount incremented - we rely on the kernel
      331             :  * lock to keep page table operations atomic even if we drop the page
     332             :  * lock before returning.
     333             :  */
     334           0 : struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
     335             :                                unsigned long addr)
     336             : {
     337             :         struct page *page;
     338             :         struct swap_info_struct *si;
     339             : 
     340           0 :         si = get_swap_device(entry);
     341           0 :         if (!si)
     342             :                 return NULL;
     343           0 :         page = find_get_page(swap_address_space(entry), swp_offset(entry));
     344           0 :         put_swap_device(si);
     345             : 
     346           0 :         INC_CACHE_INFO(find_total);
     347           0 :         if (page) {
     348           0 :                 bool vma_ra = swap_use_vma_readahead();
     349             :                 bool readahead;
     350             : 
     351           0 :                 INC_CACHE_INFO(find_success);
     352             :                 /*
     353             :                  * At the moment, we don't support PG_readahead for anon THP
     354             :                  * so let's bail out rather than confusing the readahead stat.
     355             :                  */
     356             :                 if (unlikely(PageTransCompound(page)))
     357             :                         return page;
     358             : 
     359           0 :                 readahead = TestClearPageReadahead(page);
     360           0 :                 if (vma && vma_ra) {
     361             :                         unsigned long ra_val;
     362             :                         int win, hits;
     363             : 
     364           0 :                         ra_val = GET_SWAP_RA_VAL(vma);
     365           0 :                         win = SWAP_RA_WIN(ra_val);
     366           0 :                         hits = SWAP_RA_HITS(ra_val);
     367           0 :                         if (readahead)
     368           0 :                                 hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
     369           0 :                         atomic_long_set(&vma->swap_readahead_info,
     370           0 :                                         SWAP_RA_VAL(addr, win, hits));
     371             :                 }
     372             : 
     373           0 :                 if (readahead) {
     374           0 :                         count_vm_event(SWAP_RA_HIT);
     375           0 :                         if (!vma || !vma_ra)
     376             :                                 atomic_inc(&swapin_readahead_hits);
     377             :                 }
     378             :         }
     379             : 
     380             :         return page;
     381             : }
     382             : 
     383             : /**
     384             :  * find_get_incore_page - Find and get a page from the page or swap caches.
     385             :  * @mapping: The address_space to search.
     386             :  * @index: The page cache index.
     387             :  *
     388             :  * This differs from find_get_page() in that it will also look for the
     389             :  * page in the swap cache.
     390             :  *
     391             :  * Return: The found page or %NULL.
     392             :  */
     393           0 : struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index)
     394             : {
     395             :         swp_entry_t swp;
     396             :         struct swap_info_struct *si;
     397           0 :         struct page *page = pagecache_get_page(mapping, index,
     398             :                                                 FGP_ENTRY | FGP_HEAD, 0);
     399             : 
     400           0 :         if (!page)
     401             :                 return page;
     402           0 :         if (!xa_is_value(page))
     403           0 :                 return find_subpage(page, index);
     404           0 :         if (!shmem_mapping(mapping))
     405             :                 return NULL;
     406             : 
     407           0 :         swp = radix_to_swp_entry(page);
     408             :         /* Prevent swapoff from happening to us */
     409           0 :         si = get_swap_device(swp);
     410           0 :         if (!si)
     411             :                 return NULL;
     412           0 :         page = find_get_page(swap_address_space(swp), swp_offset(swp));
     413           0 :         put_swap_device(si);
     414           0 :         return page;
     415             : }
     416             : 
     417           0 : struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
     418             :                         struct vm_area_struct *vma, unsigned long addr,
     419             :                         bool *new_page_allocated)
     420             : {
     421             :         struct swap_info_struct *si;
     422             :         struct page *page;
     423           0 :         void *shadow = NULL;
     424             : 
     425           0 :         *new_page_allocated = false;
     426             : 
     427           0 :         for (;;) {
     428             :                 int err;
     429             :                 /*
     430             :                  * First check the swap cache.  Since this is normally
     431             :                  * called after lookup_swap_cache() failed, re-calling
     432             :                  * that would confuse statistics.
     433             :                  */
     434           0 :                 si = get_swap_device(entry);
     435           0 :                 if (!si)
     436             :                         return NULL;
     437           0 :                 page = find_get_page(swap_address_space(entry),
     438             :                                      swp_offset(entry));
     439           0 :                 put_swap_device(si);
     440           0 :                 if (page)
     441             :                         return page;
     442             : 
     443             :                 /*
      444             :                  * Just skip readahead for an unused swap slot.
      445             :                  * During swap_off, when swap_slot_cache is disabled,
      446             :                  * we have to handle the race between putting the
      447             :                  * swap entry into the swap cache and marking the swap
      448             :                  * slot as SWAP_HAS_CACHE.  That's done later in this
      449             :                  * function; otherwise swap_off would abort if we returned NULL.
     450             :                  */
     451           0 :                 if (!__swp_swapcount(entry) && swap_slot_cache_enabled)
     452             :                         return NULL;
     453             : 
     454             :                 /*
     455             :                  * Get a new page to read into from swap.  Allocate it now,
     456             :                  * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will
     457             :                  * cause any racers to loop around until we add it to cache.
     458             :                  */
     459           0 :                 page = alloc_page_vma(gfp_mask, vma, addr);
     460           0 :                 if (!page)
     461             :                         return NULL;
     462             : 
     463             :                 /*
     464             :                  * Swap entry may have been freed since our caller observed it.
     465             :                  */
     466           0 :                 err = swapcache_prepare(entry);
     467           0 :                 if (!err)
     468             :                         break;
     469             : 
     470           0 :                 put_page(page);
     471           0 :                 if (err != -EEXIST)
     472             :                         return NULL;
     473             : 
     474             :                 /*
     475             :                  * We might race against __delete_from_swap_cache(), and
     476             :                  * stumble across a swap_map entry whose SWAP_HAS_CACHE
     477             :                  * has not yet been cleared.  Or race against another
     478             :                  * __read_swap_cache_async(), which has set SWAP_HAS_CACHE
     479             :                  * in swap_map, but not yet added its page to swap cache.
     480             :                  */
     481           0 :                 schedule_timeout_uninterruptible(1);
     482             :         }
     483             : 
     484             :         /*
     485             :          * The swap entry is ours to swap in. Prepare the new page.
     486             :          */
     487             : 
     488           0 :         __SetPageLocked(page);
     489           0 :         __SetPageSwapBacked(page);
     490             : 
     491           0 :         if (mem_cgroup_swapin_charge_page(page, NULL, gfp_mask, entry))
     492             :                 goto fail_unlock;
     493             : 
     494             :         /* May fail (-ENOMEM) if XArray node allocation failed. */
     495           0 :         if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow))
     496             :                 goto fail_unlock;
     497             : 
     498           0 :         mem_cgroup_swapin_uncharge_swap(entry);
     499             : 
     500           0 :         if (shadow)
     501           0 :                 workingset_refault(page_folio(page), shadow);
     502             : 
     503             :         /* Caller will initiate read into locked page */
     504           0 :         lru_cache_add(page);
     505           0 :         *new_page_allocated = true;
     506           0 :         return page;
     507             : 
     508             : fail_unlock:
     509           0 :         put_swap_page(page, entry);
     510           0 :         unlock_page(page);
     511           0 :         put_page(page);
     512           0 :         return NULL;
     513             : }
     514             : 
     515             : /*
     516             :  * Locate a page of swap in physical memory, reserving swap cache space
     517             :  * and reading the disk if it is not already cached.
     518             :  * A failure return means that either the page allocation failed or that
     519             :  * the swap entry is no longer in use.
     520             :  */
     521           0 : struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
     522             :                 struct vm_area_struct *vma, unsigned long addr, bool do_poll)
     523             : {
     524             :         bool page_was_allocated;
     525           0 :         struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
     526             :                         vma, addr, &page_was_allocated);
     527             : 
     528           0 :         if (page_was_allocated)
     529           0 :                 swap_readpage(retpage, do_poll);
     530             : 
     531           0 :         return retpage;
     532             : }
     533             : 
     534             : static unsigned int __swapin_nr_pages(unsigned long prev_offset,
     535             :                                       unsigned long offset,
     536             :                                       int hits,
     537             :                                       int max_pages,
     538             :                                       int prev_win)
     539             : {
     540             :         unsigned int pages, last_ra;
     541             : 
     542             :         /*
     543             :          * This heuristic has been found to work well on both sequential and
     544             :          * random loads, swapping to hard disk or to SSD: please don't ask
     545             :          * what the "+ 2" means, it just happens to work well, that's all.
     546             :          */
     547           0 :         pages = hits + 2;
     548           0 :         if (pages == 2) {
     549             :                 /*
     550             :                  * We can have no readahead hits to judge by: but must not get
     551             :                  * stuck here forever, so check for an adjacent offset instead
     552             :                  * (and don't even bother to check whether swap type is same).
     553             :                  */
     554           0 :                 if (offset != prev_offset + 1 && offset != prev_offset - 1)
     555           0 :                         pages = 1;
     556             :         } else {
     557             :                 unsigned int roundup = 4;
     558           0 :                 while (roundup < pages)
     559           0 :                         roundup <<= 1;
     560             :                 pages = roundup;
     561             :         }
     562             : 
     563           0 :         if (pages > max_pages)
     564           0 :                 pages = max_pages;
     565             : 
     566             :         /* Don't shrink readahead too fast */
     567           0 :         last_ra = prev_win / 2;
     568           0 :         if (pages < last_ra)
     569           0 :                 pages = last_ra;
     570             : 
     571             :         return pages;
     572             : }
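The heuristic above can be exercised outside the kernel. The sketch below mirrors its logic (pages = hits + 2, rounded up to a power of two with a floor of 4, clamped to max_pages, and never less than half the previous window) and adds a small driver; swapin_nr_pages_sketch and the sample inputs are made up for illustration.

#include <stdio.h>

static unsigned int swapin_nr_pages_sketch(unsigned long prev_offset,
                                           unsigned long offset,
                                           int hits, int max_pages,
                                           int prev_win)
{
        unsigned int pages = hits + 2, last_ra;

        if (pages == 2) {
                /* no hits: read one page unless the access looks sequential */
                if (offset != prev_offset + 1 && offset != prev_offset - 1)
                        pages = 1;
        } else {
                unsigned int roundup = 4;

                while (roundup < pages)
                        roundup <<= 1;
                pages = roundup;
        }

        if (pages > max_pages)
                pages = max_pages;

        /* don't shrink readahead too fast */
        last_ra = prev_win / 2;
        if (pages < last_ra)
                pages = last_ra;
        return pages;
}

int main(void)
{
        /* 3 recent hits, room for 8 pages, previous window 8 -> window of 8 */
        printf("%u\n", swapin_nr_pages_sketch(100, 200, 3, 8, 8));
        /* no hits and a random offset, previous window 8 -> shrink only to 4 */
        printf("%u\n", swapin_nr_pages_sketch(100, 200, 0, 8, 8));
        return 0;
}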
     573             : 
     574           0 : static unsigned long swapin_nr_pages(unsigned long offset)
     575             : {
     576             :         static unsigned long prev_offset;
     577             :         unsigned int hits, pages, max_pages;
     578             :         static atomic_t last_readahead_pages;
     579             : 
     580           0 :         max_pages = 1 << READ_ONCE(page_cluster);
     581           0 :         if (max_pages <= 1)
     582             :                 return 1;
     583             : 
     584           0 :         hits = atomic_xchg(&swapin_readahead_hits, 0);
     585           0 :         pages = __swapin_nr_pages(READ_ONCE(prev_offset), offset, hits,
     586             :                                   max_pages,
     587             :                                   atomic_read(&last_readahead_pages));
     588           0 :         if (!hits)
     589           0 :                 WRITE_ONCE(prev_offset, offset);
     590           0 :         atomic_set(&last_readahead_pages, pages);
     591             : 
     592           0 :         return pages;
     593             : }
     594             : 
     595             : /**
     596             :  * swap_cluster_readahead - swap in pages in hope we need them soon
     597             :  * @entry: swap entry of this memory
     598             :  * @gfp_mask: memory allocation flags
     599             :  * @vmf: fault information
     600             :  *
     601             :  * Returns the struct page for entry and addr, after queueing swapin.
     602             :  *
     603             :  * Primitive swap readahead code. We simply read an aligned block of
     604             :  * (1 << page_cluster) entries in the swap area. This method is chosen
     605             :  * because it doesn't cost us any seek time.  We also make sure to queue
     606             :  * the 'original' request together with the readahead ones...
     607             :  *
     608             :  * This has been extended to use the NUMA policies from the mm triggering
     609             :  * the readahead.
     610             :  *
     611             :  * Caller must hold read mmap_lock if vmf->vma is not NULL.
     612             :  */
     613           0 : struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
     614             :                                 struct vm_fault *vmf)
     615             : {
     616             :         struct page *page;
     617           0 :         unsigned long entry_offset = swp_offset(entry);
     618           0 :         unsigned long offset = entry_offset;
     619             :         unsigned long start_offset, end_offset;
     620             :         unsigned long mask;
     621           0 :         struct swap_info_struct *si = swp_swap_info(entry);
     622             :         struct blk_plug plug;
     623           0 :         bool do_poll = true, page_allocated;
     624           0 :         struct vm_area_struct *vma = vmf->vma;
     625           0 :         unsigned long addr = vmf->address;
     626             : 
     627           0 :         mask = swapin_nr_pages(offset) - 1;
     628           0 :         if (!mask)
     629             :                 goto skip;
     630             : 
     631           0 :         do_poll = false;
     632             :         /* Read a page_cluster sized and aligned cluster around offset. */
     633           0 :         start_offset = offset & ~mask;
     634           0 :         end_offset = offset | mask;
     635           0 :         if (!start_offset)      /* First page is swap header. */
     636           0 :                 start_offset++;
     637           0 :         if (end_offset >= si->max)
     638           0 :                 end_offset = si->max - 1;
     639             : 
     640           0 :         blk_start_plug(&plug);
     641           0 :         for (offset = start_offset; offset <= end_offset ; offset++) {
     642             :                 /* Ok, do the async read-ahead now */
     643           0 :                 page = __read_swap_cache_async(
     644             :                         swp_entry(swp_type(entry), offset),
     645             :                         gfp_mask, vma, addr, &page_allocated);
     646           0 :                 if (!page)
     647           0 :                         continue;
     648           0 :                 if (page_allocated) {
     649           0 :                         swap_readpage(page, false);
     650           0 :                         if (offset != entry_offset) {
     651           0 :                                 SetPageReadahead(page);
     652           0 :                                 count_vm_event(SWAP_RA);
     653             :                         }
     654             :                 }
     655           0 :                 put_page(page);
     656             :         }
     657           0 :         blk_finish_plug(&plug);
     658             : 
     659           0 :         lru_add_drain();        /* Push any new pages onto the LRU now */
     660             : skip:
     661           0 :         return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
     662             : }
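swap_cluster_readahead() turns the window size into an aligned block of swap slots: mask = window - 1, start_offset = offset & ~mask, end_offset = offset | mask, then it skips slot 0 (the swap header) and clamps to the device size. A small sketch of that computation with made-up numbers (a window of 8 slots, a device of 1000 slots):

#include <stdio.h>

int main(void)
{
        unsigned long offset = 203;     /* faulting swap slot (hypothetical)   */
        unsigned long win = 8;          /* window from swapin_nr_pages()       */
        unsigned long si_max = 1000;    /* si->max: slots on the swap device   */
        unsigned long mask = win - 1;
        unsigned long start = offset & ~mask;   /* 200: aligned block start    */
        unsigned long end = offset | mask;      /* 207: aligned block end      */

        if (!start)                     /* slot 0 holds the swap header        */
                start++;
        if (end >= si_max)              /* don't read past the device          */
                end = si_max - 1;

        printf("read slots %lu..%lu around faulting slot %lu\n",
               start, end, offset);
        return 0;
}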
     663             : 
     664           0 : int init_swap_address_space(unsigned int type, unsigned long nr_pages)
     665             : {
     666             :         struct address_space *spaces, *space;
     667             :         unsigned int i, nr;
     668             : 
     669           0 :         nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
     670           0 :         spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL);
     671           0 :         if (!spaces)
     672             :                 return -ENOMEM;
     673           0 :         for (i = 0; i < nr; i++) {
     674           0 :                 space = spaces + i;
     675           0 :                 xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ);
     676           0 :                 atomic_set(&space->i_mmap_writable, 0);
     677           0 :                 space->a_ops = &swap_aops;
     678             :                 /* swap cache doesn't use writeback related tags */
     679           0 :                 mapping_set_no_writeback_tags(space);
     680             :         }
     681           0 :         nr_swapper_spaces[type] = nr;
     682           0 :         swapper_spaces[type] = spaces;
     683             : 
     684           0 :         return 0;
     685             : }
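init_swap_address_space() splits a swap device into one struct address_space per SWAP_ADDRESS_SPACE_PAGES slots; swap_address_space() then selects the chunk by shifting the swap offset. The sketch below shows only the sizing arithmetic, assuming 2^14 slots per chunk and a hypothetical 4 GiB device:

#include <stdio.h>

#define SWAP_ADDRESS_SPACE_SHIFT 14     /* assumed kernel value */
#define SWAP_ADDRESS_SPACE_PAGES (1UL << SWAP_ADDRESS_SPACE_SHIFT)
#define DIV_ROUND_UP(n, d)       (((n) + (d) - 1) / (d))

int main(void)
{
        unsigned long nr_pages = 1048576;       /* 4 GiB of 4 KiB swap slots */
        unsigned long nr_spaces = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
        unsigned long offset = 100000;          /* some swap offset          */

        printf("%lu address_spaces; offset %lu maps to space %lu\n",
               nr_spaces, offset, offset >> SWAP_ADDRESS_SPACE_SHIFT);
        return 0;
}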
     686             : 
     687           0 : void exit_swap_address_space(unsigned int type)
     688             : {
     689             :         int i;
     690           0 :         struct address_space *spaces = swapper_spaces[type];
     691             : 
     692           0 :         for (i = 0; i < nr_swapper_spaces[type]; i++)
     693             :                 VM_WARN_ON_ONCE(!mapping_empty(&spaces[i]));
     694           0 :         kvfree(spaces);
     695           0 :         nr_swapper_spaces[type] = 0;
     696           0 :         swapper_spaces[type] = NULL;
     697           0 : }
     698             : 
     699             : static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma,
     700             :                                      unsigned long faddr,
     701             :                                      unsigned long lpfn,
     702             :                                      unsigned long rpfn,
     703             :                                      unsigned long *start,
     704             :                                      unsigned long *end)
     705             : {
     706           0 :         *start = max3(lpfn, PFN_DOWN(vma->vm_start),
     707             :                       PFN_DOWN(faddr & PMD_MASK));
     708           0 :         *end = min3(rpfn, PFN_DOWN(vma->vm_end),
     709             :                     PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
     710             : }
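swap_ra_clamp_pfn() limits the readahead window [lpfn, rpfn) to both the VMA and the PMD-sized region around the fault address. The user-space sketch below reproduces that clamping with plain helpers standing in for max3()/min3(), assuming 4 KiB pages and a 2 MiB PMD; the addresses are made up:

#include <stdio.h>

#define PAGE_SHIFT  12                          /* assumed 4 KiB pages */
#define PMD_SIZE    (1UL << 21)                 /* assumed 2 MiB PMD   */
#define PMD_MASK    (~(PMD_SIZE - 1))
#define PFN_DOWN(x) ((x) >> PAGE_SHIFT)

static unsigned long max3u(unsigned long a, unsigned long b, unsigned long c)
{
        unsigned long m = a > b ? a : b;
        return m > c ? m : c;
}

static unsigned long min3u(unsigned long a, unsigned long b, unsigned long c)
{
        unsigned long m = a < b ? a : b;
        return m < c ? m : c;
}

int main(void)
{
        unsigned long vm_start = 0x400000, vm_end = 0x800000;
        unsigned long faddr = 0x5ff000;         /* fault just below a PMD boundary */
        unsigned long lpfn = PFN_DOWN(faddr) - 8, rpfn = PFN_DOWN(faddr) + 8;

        unsigned long start = max3u(lpfn, PFN_DOWN(vm_start),
                                    PFN_DOWN(faddr & PMD_MASK));
        unsigned long end = min3u(rpfn, PFN_DOWN(vm_end),
                                  PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));

        /* prints "window clamped to pfns [1527, 1536)": cut off at the PMD boundary */
        printf("window clamped to pfns [%lu, %lu)\n", start, end);
        return 0;
}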
     711             : 
     712           0 : static void swap_ra_info(struct vm_fault *vmf,
     713             :                         struct vma_swap_readahead *ra_info)
     714             : {
     715           0 :         struct vm_area_struct *vma = vmf->vma;
     716             :         unsigned long ra_val;
     717             :         unsigned long faddr, pfn, fpfn;
     718             :         unsigned long start, end;
     719             :         pte_t *pte, *orig_pte;
     720             :         unsigned int max_win, hits, prev_win, win, left;
     721             : #ifndef CONFIG_64BIT
     722             :         pte_t *tpte;
     723             : #endif
     724             : 
     725           0 :         max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster),
     726             :                              SWAP_RA_ORDER_CEILING);
     727           0 :         if (max_win == 1) {
     728           0 :                 ra_info->win = 1;
     729           0 :                 return;
     730             :         }
     731             : 
     732           0 :         faddr = vmf->address;
     733           0 :         orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
     734             : 
     735           0 :         fpfn = PFN_DOWN(faddr);
     736           0 :         ra_val = GET_SWAP_RA_VAL(vma);
     737           0 :         pfn = PFN_DOWN(SWAP_RA_ADDR(ra_val));
     738           0 :         prev_win = SWAP_RA_WIN(ra_val);
     739           0 :         hits = SWAP_RA_HITS(ra_val);
     740           0 :         ra_info->win = win = __swapin_nr_pages(pfn, fpfn, hits,
     741             :                                                max_win, prev_win);
     742           0 :         atomic_long_set(&vma->swap_readahead_info,
     743           0 :                         SWAP_RA_VAL(faddr, win, 0));
     744             : 
     745           0 :         if (win == 1) {
     746             :                 pte_unmap(orig_pte);
     747             :                 return;
     748             :         }
     749             : 
     750             :         /* Copy the PTEs because the page table may be unmapped */
     751           0 :         if (fpfn == pfn + 1)
     752           0 :                 swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end);
     753           0 :         else if (pfn == fpfn + 1)
     754           0 :                 swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1,
     755             :                                   &start, &end);
     756             :         else {
     757           0 :                 left = (win - 1) / 2;
     758           0 :                 swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left,
     759             :                                   &start, &end);
     760             :         }
     761           0 :         ra_info->nr_pte = end - start;
     762           0 :         ra_info->offset = fpfn - start;
     763           0 :         pte -= ra_info->offset;
     764             : #ifdef CONFIG_64BIT
     765           0 :         ra_info->ptes = pte;
     766             : #else
     767             :         tpte = ra_info->ptes;
     768             :         for (pfn = start; pfn != end; pfn++)
     769             :                 *tpte++ = *pte++;
     770             : #endif
     771             :         pte_unmap(orig_pte);
     772             : }
     773             : 
     774             : /**
     775             :  * swap_vma_readahead - swap in pages in hope we need them soon
     776             :  * @fentry: swap entry of this memory
     777             :  * @gfp_mask: memory allocation flags
     778             :  * @vmf: fault information
     779             :  *
     780             :  * Returns the struct page for entry and addr, after queueing swapin.
     781             :  *
     782             :  * Primitive swap readahead code. We simply read in a few pages whose
     783             :  * virtual addresses are around the fault address in the same vma.
     784             :  *
     785             :  * Caller must hold read mmap_lock if vmf->vma is not NULL.
     786             :  *
     787             :  */
     788           0 : static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
     789             :                                        struct vm_fault *vmf)
     790             : {
     791             :         struct blk_plug plug;
     792           0 :         struct vm_area_struct *vma = vmf->vma;
     793             :         struct page *page;
     794             :         pte_t *pte, pentry;
     795             :         swp_entry_t entry;
     796             :         unsigned int i;
     797             :         bool page_allocated;
     798           0 :         struct vma_swap_readahead ra_info = {
     799             :                 .win = 1,
     800             :         };
     801             : 
     802           0 :         swap_ra_info(vmf, &ra_info);
     803           0 :         if (ra_info.win == 1)
     804             :                 goto skip;
     805             : 
     806           0 :         blk_start_plug(&plug);
     807           0 :         for (i = 0, pte = ra_info.ptes; i < ra_info.nr_pte;
     808           0 :              i++, pte++) {
     809           0 :                 pentry = *pte;
     810           0 :                 if (pte_none(pentry))
     811           0 :                         continue;
     812           0 :                 if (pte_present(pentry))
     813           0 :                         continue;
     814           0 :                 entry = pte_to_swp_entry(pentry);
     815           0 :                 if (unlikely(non_swap_entry(entry)))
     816           0 :                         continue;
     817           0 :                 page = __read_swap_cache_async(entry, gfp_mask, vma,
     818             :                                                vmf->address, &page_allocated);
     819           0 :                 if (!page)
     820           0 :                         continue;
     821           0 :                 if (page_allocated) {
     822           0 :                         swap_readpage(page, false);
     823           0 :                         if (i != ra_info.offset) {
     824           0 :                                 SetPageReadahead(page);
     825           0 :                                 count_vm_event(SWAP_RA);
     826             :                         }
     827             :                 }
     828           0 :                 put_page(page);
     829             :         }
     830           0 :         blk_finish_plug(&plug);
     831           0 :         lru_add_drain();
     832             : skip:
     833           0 :         return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
     834           0 :                                      ra_info.win == 1);
     835             : }
     836             : 
     837             : /**
     838             :  * swapin_readahead - swap in pages in hope we need them soon
     839             :  * @entry: swap entry of this memory
     840             :  * @gfp_mask: memory allocation flags
     841             :  * @vmf: fault information
     842             :  *
     843             :  * Returns the struct page for entry and addr, after queueing swapin.
     844             :  *
      845             :  * It's the main entry function for swap readahead. Depending on the
      846             :  * configuration, it reads ahead either cluster-based (i.e. physical
      847             :  * disk based) or vma-based (i.e. virtual addresses around the fault address).
     848             :  */
     849           0 : struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
     850             :                                 struct vm_fault *vmf)
     851             : {
     852             :         return swap_use_vma_readahead() ?
     853           0 :                         swap_vma_readahead(entry, gfp_mask, vmf) :
     854             :                         swap_cluster_readahead(entry, gfp_mask, vmf);
     855             : }
     856             : 
     857             : #ifdef CONFIG_SYSFS
     858           0 : static ssize_t vma_ra_enabled_show(struct kobject *kobj,
     859             :                                      struct kobj_attribute *attr, char *buf)
     860             : {
     861           0 :         return sysfs_emit(buf, "%s\n",
     862           0 :                           enable_vma_readahead ? "true" : "false");
     863             : }
     864           0 : static ssize_t vma_ra_enabled_store(struct kobject *kobj,
     865             :                                       struct kobj_attribute *attr,
     866             :                                       const char *buf, size_t count)
     867             : {
     868           0 :         if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1))
     869           0 :                 enable_vma_readahead = true;
     870           0 :         else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1))
     871           0 :                 enable_vma_readahead = false;
     872             :         else
     873             :                 return -EINVAL;
     874             : 
     875           0 :         return count;
     876             : }
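With CONFIG_SYSFS, the attribute above is expected to appear as /sys/kernel/mm/swap/vma_ra_enabled (swap_init_sysfs() below registers the "swap" kobject under /sys/kernel/mm). A minimal user-space sketch of flipping it, which needs root and uses the same "true"/"false"/"1"/"0" strings the store handler parses:

#include <stdio.h>

int main(void)
{
        const char *path = "/sys/kernel/mm/swap/vma_ra_enabled";
        FILE *f = fopen(path, "w");

        if (!f) {
                perror(path);           /* typically needs root */
                return 1;
        }
        fputs("false\n", f);            /* fall back to cluster-based readahead */
        fclose(f);
        return 0;
}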
     877             : static struct kobj_attribute vma_ra_enabled_attr =
     878             :         __ATTR(vma_ra_enabled, 0644, vma_ra_enabled_show,
     879             :                vma_ra_enabled_store);
     880             : 
     881             : static struct attribute *swap_attrs[] = {
     882             :         &vma_ra_enabled_attr.attr,
     883             :         NULL,
     884             : };
     885             : 
     886             : static const struct attribute_group swap_attr_group = {
     887             :         .attrs = swap_attrs,
     888             : };
     889             : 
     890           1 : static int __init swap_init_sysfs(void)
     891             : {
     892             :         int err;
     893             :         struct kobject *swap_kobj;
     894             : 
     895           1 :         swap_kobj = kobject_create_and_add("swap", mm_kobj);
     896           1 :         if (!swap_kobj) {
     897           0 :                 pr_err("failed to create swap kobject\n");
     898           0 :                 return -ENOMEM;
     899             :         }
     900           1 :         err = sysfs_create_group(swap_kobj, &swap_attr_group);
     901           1 :         if (err) {
     902           0 :                 pr_err("failed to register swap group\n");
     903             :                 goto delete_obj;
     904             :         }
     905             :         return 0;
     906             : 
     907             : delete_obj:
     908           0 :         kobject_put(swap_kobj);
     909           0 :         return err;
     910             : }
     911             : subsys_initcall(swap_init_sysfs);
     912             : #endif

Generated by: LCOV version 1.14