Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * linux/mm/swap_state.c
4 : *
5 : * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
6 : * Swap reorganised 29.12.95, Stephen Tweedie
7 : *
8 : * Rewritten to use page cache, (C) 1998 Stephen Tweedie
9 : */
10 : #include <linux/mm.h>
11 : #include <linux/gfp.h>
12 : #include <linux/kernel_stat.h>
13 : #include <linux/swap.h>
14 : #include <linux/swapops.h>
15 : #include <linux/init.h>
16 : #include <linux/pagemap.h>
17 : #include <linux/backing-dev.h>
18 : #include <linux/blkdev.h>
19 : #include <linux/pagevec.h>
20 : #include <linux/migrate.h>
21 : #include <linux/vmalloc.h>
22 : #include <linux/swap_slots.h>
23 : #include <linux/huge_mm.h>
24 : #include <linux/shmem_fs.h>
25 : #include "internal.h"
26 :
27 : /*
28 : * swapper_space is a fiction, retained to simplify the path through
29 : * vmscan's shrink_page_list.
30 : */
31 : static const struct address_space_operations swap_aops = {
32 : .writepage = swap_writepage,
33 : .dirty_folio = swap_dirty_folio,
34 : #ifdef CONFIG_MIGRATION
35 : .migratepage = migrate_page,
36 : #endif
37 : };
38 :
39 : struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly;
40 : static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly;
41 : static bool enable_vma_readahead __read_mostly = true;
42 :
43 : #define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2)
44 : #define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1)
45 : #define SWAP_RA_HITS_MAX SWAP_RA_HITS_MASK
46 : #define SWAP_RA_WIN_MASK (~PAGE_MASK & ~SWAP_RA_HITS_MASK)
47 :
48 : #define SWAP_RA_HITS(v) ((v) & SWAP_RA_HITS_MASK)
49 : #define SWAP_RA_WIN(v) (((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
50 : #define SWAP_RA_ADDR(v) ((v) & PAGE_MASK)
51 :
52 : #define SWAP_RA_VAL(addr, win, hits) \
53 : (((addr) & PAGE_MASK) | \
54 : (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) | \
55 : ((hits) & SWAP_RA_HITS_MASK))
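/*
 * Layout of the value packed by SWAP_RA_VAL() (an illustration assuming 4KB
 * pages, i.e. PAGE_SHIFT == 12 and therefore SWAP_RA_WIN_SHIFT == 6):
 *
 *   bits  0..5   readahead hits   (SWAP_RA_HITS_MASK == 0x3f)
 *   bits  6..11  readahead window (SWAP_RA_WIN_MASK  == 0xfc0)
 *   bits 12..    page-aligned fault address
 *
 * e.g. SWAP_RA_VAL(0x7f0000001000, 8, 3) == 0x7f0000001000 | (8 << 6) | 3
 *                                        == 0x7f0000001203
 */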
56 :
57 : /* The initial readahead hit count is 4, to start with a small window */
58 : #define GET_SWAP_RA_VAL(vma) \
59 : (atomic_long_read(&(vma)->swap_readahead_info) ? : 4)
60 :
61 : #define INC_CACHE_INFO(x) data_race(swap_cache_info.x++)
62 : #define ADD_CACHE_INFO(x, nr) data_race(swap_cache_info.x += (nr))
63 :
64 : static struct {
65 : unsigned long add_total;
66 : unsigned long del_total;
67 : unsigned long find_success;
68 : unsigned long find_total;
69 : } swap_cache_info;
70 :
71 : static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);
72 :
73 0 : void show_swap_cache_info(void)
74 : {
75 0 : printk("%lu pages in swap cache\n", total_swapcache_pages());
76 0 : printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
77 : swap_cache_info.add_total, swap_cache_info.del_total,
78 : swap_cache_info.find_success, swap_cache_info.find_total);
79 0 : printk("Free swap = %ldkB\n",
80 : get_nr_swap_pages() << (PAGE_SHIFT - 10));
81 0 : printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
82 0 : }
83 :
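/*
 * A swap cache shadow is an XArray value entry (xa_is_value()) stored by
 * __delete_from_swap_cache() when a page is reclaimed; it encodes workingset
 * eviction information that is consumed on refault via workingset_refault()
 * (see __read_swap_cache_async() below).
 */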
84 0 : void *get_shadow_from_swap_cache(swp_entry_t entry)
85 : {
86 0 : struct address_space *address_space = swap_address_space(entry);
87 0 : pgoff_t idx = swp_offset(entry);
88 : struct page *page;
89 :
90 0 : page = xa_load(&address_space->i_pages, idx);
91 0 : if (xa_is_value(page))
92 : return page;
93 0 : return NULL;
94 : }
95 :
96 : /*
97 : * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
98 : * but sets the SwapCache flag and page private instead of mapping and index.
99 : */
100 0 : int add_to_swap_cache(struct page *page, swp_entry_t entry,
101 : gfp_t gfp, void **shadowp)
102 : {
103 0 : struct address_space *address_space = swap_address_space(entry);
104 0 : pgoff_t idx = swp_offset(entry);
105 0 : XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page));
106 0 : unsigned long i, nr = thp_nr_pages(page);
107 : void *old;
108 :
109 : VM_BUG_ON_PAGE(!PageLocked(page), page);
110 : VM_BUG_ON_PAGE(PageSwapCache(page), page);
111 : VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
112 :
113 0 : page_ref_add(page, nr);
114 : SetPageSwapCache(page);
115 :
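/*
 * Standard XArray retry pattern: if xas_create_range() fails with -ENOMEM
 * under the lock, xas_nomem() allocates memory with @gfp after the lock has
 * been dropped and returns true, so the whole insertion is retried.
 */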
116 : do {
117 0 : xas_lock_irq(&xas);
118 0 : xas_create_range(&xas);
119 0 : if (xas_error(&xas))
120 : goto unlock;
121 0 : for (i = 0; i < nr; i++) {
122 : VM_BUG_ON_PAGE(xas.xa_index != idx + i, page);
123 0 : old = xas_load(&xas);
124 0 : if (xa_is_value(old)) {
125 0 : if (shadowp)
126 0 : *shadowp = old;
127 : }
128 0 : set_page_private(page + i, entry.val + i);
129 0 : xas_store(&xas, page);
130 0 : xas_next(&xas);
131 : }
132 0 : address_space->nrpages += nr;
133 0 : __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
134 0 : __mod_lruvec_page_state(page, NR_SWAPCACHE, nr);
135 0 : ADD_CACHE_INFO(add_total, nr);
136 : unlock:
137 0 : xas_unlock_irq(&xas);
138 0 : } while (xas_nomem(&xas, gfp));
139 :
140 0 : if (!xas_error(&xas))
141 : return 0;
142 :
143 0 : ClearPageSwapCache(page);
144 0 : page_ref_sub(page, nr);
145 0 : return xas_error(&xas);
146 : }
147 :
148 : /*
149 : * This must be called only on pages that have
150 : * been verified to be in the swap cache.
151 : */
152 0 : void __delete_from_swap_cache(struct page *page,
153 : swp_entry_t entry, void *shadow)
154 : {
155 0 : struct address_space *address_space = swap_address_space(entry);
156 0 : int i, nr = thp_nr_pages(page);
157 0 : pgoff_t idx = swp_offset(entry);
158 0 : XA_STATE(xas, &address_space->i_pages, idx);
159 :
160 : VM_BUG_ON_PAGE(!PageLocked(page), page);
161 : VM_BUG_ON_PAGE(!PageSwapCache(page), page);
162 : VM_BUG_ON_PAGE(PageWriteback(page), page);
163 :
164 0 : for (i = 0; i < nr; i++) {
165 0 : void *entry = xas_store(&xas, shadow);
166 : VM_BUG_ON_PAGE(entry != page, entry);
167 0 : set_page_private(page + i, 0);
168 0 : xas_next(&xas);
169 : }
170 0 : ClearPageSwapCache(page);
171 0 : address_space->nrpages -= nr;
172 0 : __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
173 0 : __mod_lruvec_page_state(page, NR_SWAPCACHE, -nr);
174 0 : ADD_CACHE_INFO(del_total, nr);
175 0 : }
176 :
177 : /**
178 : * add_to_swap - allocate swap space for a page
179 : * @page: page we want to move to swap
180 : *
181 : * Allocate swap space for the page and add the page to the
182 : * swap cache. Caller needs to hold the page lock.
183 : */
184 0 : int add_to_swap(struct page *page)
185 : {
186 : swp_entry_t entry;
187 : int err;
188 :
189 : VM_BUG_ON_PAGE(!PageLocked(page), page);
190 : VM_BUG_ON_PAGE(!PageUptodate(page), page);
191 :
192 0 : entry = get_swap_page(page);
193 0 : if (!entry.val)
194 : return 0;
195 :
196 : /*
197 : * XArray node allocations from PF_MEMALLOC contexts could
198 : * completely exhaust the page allocator. __GFP_NOMEMALLOC
199 : * stops emergency reserves from being allocated.
200 : *
201 : * TODO: this could cause a theoretical memory reclaim
202 : * deadlock in the swap out path.
203 : */
204 : /*
205 : * Add it to the swap cache.
206 : */
207 0 : err = add_to_swap_cache(page, entry,
208 : __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL);
209 0 : if (err)
210 : /*
211 : * add_to_swap_cache() doesn't return -EEXIST, so we can safely
212 : * clear SWAP_HAS_CACHE flag.
213 : */
214 : goto fail;
215 : /*
216 : * Normally the page will be dirtied in unmap because its pte should be
217 : * dirty. A special case is an MADV_FREE page: its pte could have the
218 : * dirty bit cleared while the page's SwapBacked bit is still set, because
219 : * clearing the dirty bit and the SwapBacked bit is not done under a
220 : * common lock. For such a page, unmap will not set the dirty bit, so page
221 : * reclaim will not write the page out. This can cause data corruption
222 : * when the page is swapped in later. Always setting the dirty bit for
223 : * the page solves the problem.
224 : */
225 0 : set_page_dirty(page);
226 :
227 0 : return 1;
228 :
229 : fail:
230 0 : put_swap_page(page, entry);
231 0 : return 0;
232 : }
233 :
234 : /*
235 : * This must be called only on pages that have
236 : * been verified to be in the swap cache and locked.
237 : * It will never put the page into the free list;
238 : * the caller has a reference on the page.
239 : */
240 0 : void delete_from_swap_cache(struct page *page)
241 : {
242 0 : swp_entry_t entry = { .val = page_private(page) };
243 0 : struct address_space *address_space = swap_address_space(entry);
244 :
245 0 : xa_lock_irq(&address_space->i_pages);
246 0 : __delete_from_swap_cache(page, entry, NULL);
247 0 : xa_unlock_irq(&address_space->i_pages);
248 :
249 0 : put_swap_page(page, entry);
250 0 : page_ref_sub(page, thp_nr_pages(page));
251 0 : }
252 :
253 0 : void clear_shadow_from_swap_cache(int type, unsigned long begin,
254 : unsigned long end)
255 : {
256 0 : unsigned long curr = begin;
257 : void *old;
258 :
259 0 : for (;;) {
260 0 : swp_entry_t entry = swp_entry(type, curr);
261 0 : struct address_space *address_space = swap_address_space(entry);
262 0 : XA_STATE(xas, &address_space->i_pages, curr);
263 :
264 0 : xa_lock_irq(&address_space->i_pages);
265 0 : xas_for_each(&xas, old, end) {
266 0 : if (!xa_is_value(old))
267 0 : continue;
268 0 : xas_store(&xas, NULL);
269 : }
270 0 : xa_unlock_irq(&address_space->i_pages);
271 :
272 : /* Advance curr to the start of the next swap address space; stop once past end. */
273 0 : curr >>= SWAP_ADDRESS_SPACE_SHIFT;
274 0 : curr++;
275 0 : curr <<= SWAP_ADDRESS_SPACE_SHIFT;
276 0 : if (curr > end)
277 : break;
278 : }
279 0 : }
280 :
281 : /*
282 : * If we are the only user, then try to free up the swap cache.
283 : *
284 : * It's ok to check for PageSwapCache without the page lock
285 : * here because we are going to recheck inside
286 : * try_to_free_swap() _with_ the lock.
287 : * - Marcelo
288 : */
289 0 : void free_swap_cache(struct page *page)
290 : {
291 0 : if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) {
292 0 : try_to_free_swap(page);
293 0 : unlock_page(page);
294 : }
295 0 : }
296 :
297 : /*
298 : * Perform a put_page(), also freeing any swap cache associated with
299 : * this page if it is the last user of the page.
300 : */
301 0 : void free_page_and_swap_cache(struct page *page)
302 : {
303 0 : free_swap_cache(page);
304 0 : if (!is_huge_zero_page(page))
305 0 : put_page(page);
306 0 : }
307 :
308 : /*
309 : * Passed an array of pages, drop them all from swapcache and then release
310 : * them. They are removed from the LRU and freed if this is their last use.
311 : */
312 0 : void free_pages_and_swap_cache(struct page **pages, int nr)
313 : {
314 0 : struct page **pagep = pages;
315 : int i;
316 :
317 0 : lru_add_drain();
318 0 : for (i = 0; i < nr; i++)
319 0 : free_swap_cache(pagep[i]);
320 0 : release_pages(pagep, nr);
321 0 : }
322 :
323 : static inline bool swap_use_vma_readahead(void)
324 : {
325 0 : return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap);
326 : }
327 :
328 : /*
329 : * Look up a swap entry in the swap cache. A found page will be returned
330 : * unlocked and with its refcount incremented - we rely on the kernel
331 : * lock to keep page table operations atomic even if we drop the page
332 : * lock before returning.
333 : */
334 0 : struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
335 : unsigned long addr)
336 : {
337 : struct page *page;
338 : struct swap_info_struct *si;
339 :
340 0 : si = get_swap_device(entry);
341 0 : if (!si)
342 : return NULL;
343 0 : page = find_get_page(swap_address_space(entry), swp_offset(entry));
344 0 : put_swap_device(si);
345 :
346 0 : INC_CACHE_INFO(find_total);
347 0 : if (page) {
348 0 : bool vma_ra = swap_use_vma_readahead();
349 : bool readahead;
350 :
351 0 : INC_CACHE_INFO(find_success);
352 : /*
353 : * At the moment, we don't support PG_readahead for anon THP
354 : * so let's bail out rather than confusing the readahead stat.
355 : */
356 : if (unlikely(PageTransCompound(page)))
357 : return page;
358 :
359 0 : readahead = TestClearPageReadahead(page);
360 0 : if (vma && vma_ra) {
361 : unsigned long ra_val;
362 : int win, hits;
363 :
364 0 : ra_val = GET_SWAP_RA_VAL(vma);
365 0 : win = SWAP_RA_WIN(ra_val);
366 0 : hits = SWAP_RA_HITS(ra_val);
367 0 : if (readahead)
368 0 : hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
369 0 : atomic_long_set(&vma->swap_readahead_info,
370 0 : SWAP_RA_VAL(addr, win, hits));
371 : }
372 :
373 0 : if (readahead) {
374 0 : count_vm_event(SWAP_RA_HIT);
375 0 : if (!vma || !vma_ra)
376 : atomic_inc(&swapin_readahead_hits);
377 : }
378 : }
379 :
380 : return page;
381 : }
382 :
383 : /**
384 : * find_get_incore_page - Find and get a page from the page or swap caches.
385 : * @mapping: The address_space to search.
386 : * @index: The page cache index.
387 : *
388 : * This differs from find_get_page() in that it will also look for the
389 : * page in the swap cache.
390 : *
391 : * Return: The found page or %NULL.
392 : */
393 0 : struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index)
394 : {
395 : swp_entry_t swp;
396 : struct swap_info_struct *si;
397 0 : struct page *page = pagecache_get_page(mapping, index,
398 : FGP_ENTRY | FGP_HEAD, 0);
399 :
400 0 : if (!page)
401 : return page;
402 0 : if (!xa_is_value(page))
403 0 : return find_subpage(page, index);
404 0 : if (!shmem_mapping(mapping))
405 : return NULL;
406 :
407 0 : swp = radix_to_swp_entry(page);
408 : /* Prevent swapoff from happening to us */
409 0 : si = get_swap_device(swp);
410 0 : if (!si)
411 : return NULL;
412 0 : page = find_get_page(swap_address_space(swp), swp_offset(swp));
413 0 : put_swap_device(si);
414 0 : return page;
415 : }
416 :
417 0 : struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
418 : struct vm_area_struct *vma, unsigned long addr,
419 : bool *new_page_allocated)
420 : {
421 : struct swap_info_struct *si;
422 : struct page *page;
423 0 : void *shadow = NULL;
424 :
425 0 : *new_page_allocated = false;
426 :
427 0 : for (;;) {
428 : int err;
429 : /*
430 : * First check the swap cache. Since this is normally
431 : * called after lookup_swap_cache() failed, re-calling
432 : * that would confuse statistics.
433 : */
434 0 : si = get_swap_device(entry);
435 0 : if (!si)
436 : return NULL;
437 0 : page = find_get_page(swap_address_space(entry),
438 : swp_offset(entry));
439 0 : put_swap_device(si);
440 0 : if (page)
441 : return page;
442 :
443 : /*
444 : * Just skip readahead for an unused swap slot.
445 : * During swapoff, when swap_slot_cache is disabled,
446 : * we have to handle the race between putting a
447 : * swap entry in the swap cache and marking the swap
448 : * slot as SWAP_HAS_CACHE. That is done later in this
449 : * function; otherwise swapoff would be aborted if we returned NULL.
450 : */
451 0 : if (!__swp_swapcount(entry) && swap_slot_cache_enabled)
452 : return NULL;
453 :
454 : /*
455 : * Get a new page to read into from swap. Allocate it now,
456 : * before marking swap_map with SWAP_HAS_CACHE; after that,
457 : * -EEXIST makes any racers loop around until we add the page to the cache.
458 : */
459 0 : page = alloc_page_vma(gfp_mask, vma, addr);
460 0 : if (!page)
461 : return NULL;
462 :
463 : /*
464 : * Swap entry may have been freed since our caller observed it.
465 : */
466 0 : err = swapcache_prepare(entry);
467 0 : if (!err)
468 : break;
469 :
470 0 : put_page(page);
471 0 : if (err != -EEXIST)
472 : return NULL;
473 :
474 : /*
475 : * We might race against __delete_from_swap_cache(), and
476 : * stumble across a swap_map entry whose SWAP_HAS_CACHE
477 : * has not yet been cleared. Or race against another
478 : * __read_swap_cache_async(), which has set SWAP_HAS_CACHE
479 : * in swap_map, but not yet added its page to swap cache.
480 : */
481 0 : schedule_timeout_uninterruptible(1);
482 : }
483 :
484 : /*
485 : * The swap entry is ours to swap in. Prepare the new page.
486 : */
487 :
488 0 : __SetPageLocked(page);
489 0 : __SetPageSwapBacked(page);
490 :
491 0 : if (mem_cgroup_swapin_charge_page(page, NULL, gfp_mask, entry))
492 : goto fail_unlock;
493 :
494 : /* May fail (-ENOMEM) if XArray node allocation failed. */
495 0 : if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow))
496 : goto fail_unlock;
497 :
498 0 : mem_cgroup_swapin_uncharge_swap(entry);
499 :
500 0 : if (shadow)
501 0 : workingset_refault(page_folio(page), shadow);
502 :
503 : /* Caller will initiate read into locked page */
504 0 : lru_cache_add(page);
505 0 : *new_page_allocated = true;
506 0 : return page;
507 :
508 : fail_unlock:
509 0 : put_swap_page(page, entry);
510 0 : unlock_page(page);
511 0 : put_page(page);
512 0 : return NULL;
513 : }
514 :
515 : /*
516 : * Locate a page of swap in physical memory, reserving swap cache space
517 : * and reading the disk if it is not already cached.
518 : * A failure return means that either the page allocation failed or that
519 : * the swap entry is no longer in use.
520 : */
521 0 : struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
522 : struct vm_area_struct *vma, unsigned long addr, bool do_poll)
523 : {
524 : bool page_was_allocated;
525 0 : struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
526 : vma, addr, &page_was_allocated);
527 :
528 0 : if (page_was_allocated)
529 0 : swap_readpage(retpage, do_poll);
530 :
531 0 : return retpage;
532 : }
533 :
534 : static unsigned int __swapin_nr_pages(unsigned long prev_offset,
535 : unsigned long offset,
536 : int hits,
537 : int max_pages,
538 : int prev_win)
539 : {
540 : unsigned int pages, last_ra;
541 :
542 : /*
543 : * This heuristic has been found to work well on both sequential and
544 : * random loads, swapping to hard disk or to SSD: please don't ask
545 : * what the "+ 2" means, it just happens to work well, that's all.
546 : */
547 0 : pages = hits + 2;
548 0 : if (pages == 2) {
549 : /*
550 : * We can have no readahead hits to judge by: but must not get
551 : * stuck here forever, so check for an adjacent offset instead
552 : * (and don't even bother to check whether the swap type is the same).
553 : */
554 0 : if (offset != prev_offset + 1 && offset != prev_offset - 1)
555 0 : pages = 1;
556 : } else {
557 : unsigned int roundup = 4;
558 0 : while (roundup < pages)
559 0 : roundup <<= 1;
560 : pages = roundup;
561 : }
562 :
563 0 : if (pages > max_pages)
564 0 : pages = max_pages;
565 :
566 : /* Don't shrink readahead too fast */
567 0 : last_ra = prev_win / 2;
568 0 : if (pages < last_ra)
569 0 : pages = last_ra;
570 :
571 : return pages;
572 : }
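/*
 * Worked example of the heuristic above: with hits == 3 the initial estimate
 * is 3 + 2 == 5 pages, which is rounded up to the next power of two, 8.  With
 * hits == 0 the window collapses to a single page unless the fault is
 * adjacent to the previous offset.  The result is always capped at max_pages
 * and never drops below half of the previous window.
 */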
573 :
574 0 : static unsigned long swapin_nr_pages(unsigned long offset)
575 : {
576 : static unsigned long prev_offset;
577 : unsigned int hits, pages, max_pages;
578 : static atomic_t last_readahead_pages;
579 :
580 0 : max_pages = 1 << READ_ONCE(page_cluster);
581 0 : if (max_pages <= 1)
582 : return 1;
583 :
584 0 : hits = atomic_xchg(&swapin_readahead_hits, 0);
585 0 : pages = __swapin_nr_pages(READ_ONCE(prev_offset), offset, hits,
586 : max_pages,
587 : atomic_read(&last_readahead_pages));
588 0 : if (!hits)
589 0 : WRITE_ONCE(prev_offset, offset);
590 0 : atomic_set(&last_readahead_pages, pages);
591 :
592 0 : return pages;
593 : }
594 :
595 : /**
596 : * swap_cluster_readahead - swap in pages in the hope we need them soon
597 : * @entry: swap entry of this memory
598 : * @gfp_mask: memory allocation flags
599 : * @vmf: fault information
600 : *
601 : * Returns the struct page for entry and addr, after queueing swapin.
602 : *
603 : * Primitive swap readahead code. We simply read an aligned block of
604 : * (1 << page_cluster) entries in the swap area. This method is chosen
605 : * because it doesn't cost us any seek time. We also make sure to queue
606 : * the 'original' request together with the readahead ones...
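* For example, with an 8-page window a fault on swap offset 19 queues reads
* for offsets 16..23 (offset 0 is always skipped because it holds the swap
* header, and the window is clamped to si->max - 1).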
607 : *
608 : * This has been extended to use the NUMA policies from the mm triggering
609 : * the readahead.
610 : *
611 : * Caller must hold read mmap_lock if vmf->vma is not NULL.
612 : */
613 0 : struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
614 : struct vm_fault *vmf)
615 : {
616 : struct page *page;
617 0 : unsigned long entry_offset = swp_offset(entry);
618 0 : unsigned long offset = entry_offset;
619 : unsigned long start_offset, end_offset;
620 : unsigned long mask;
621 0 : struct swap_info_struct *si = swp_swap_info(entry);
622 : struct blk_plug plug;
623 0 : bool do_poll = true, page_allocated;
624 0 : struct vm_area_struct *vma = vmf->vma;
625 0 : unsigned long addr = vmf->address;
626 :
627 0 : mask = swapin_nr_pages(offset) - 1;
628 0 : if (!mask)
629 : goto skip;
630 :
631 0 : do_poll = false;
632 : /* Read a page_cluster sized and aligned cluster around offset. */
633 0 : start_offset = offset & ~mask;
634 0 : end_offset = offset | mask;
635 0 : if (!start_offset) /* First page is swap header. */
636 0 : start_offset++;
637 0 : if (end_offset >= si->max)
638 0 : end_offset = si->max - 1;
639 :
640 0 : blk_start_plug(&plug);
641 0 : for (offset = start_offset; offset <= end_offset ; offset++) {
642 : /* Ok, do the async read-ahead now */
643 0 : page = __read_swap_cache_async(
644 : swp_entry(swp_type(entry), offset),
645 : gfp_mask, vma, addr, &page_allocated);
646 0 : if (!page)
647 0 : continue;
648 0 : if (page_allocated) {
649 0 : swap_readpage(page, false);
650 0 : if (offset != entry_offset) {
651 0 : SetPageReadahead(page);
652 0 : count_vm_event(SWAP_RA);
653 : }
654 : }
655 0 : put_page(page);
656 : }
657 0 : blk_finish_plug(&plug);
658 :
659 0 : lru_add_drain(); /* Push any new pages onto the LRU now */
660 : skip:
661 0 : return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
662 : }
663 :
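/*
 * The swap cache of each swap device is split into multiple address_spaces,
 * one per SWAP_ADDRESS_SPACE_PAGES worth of slots, so that concurrent swap
 * cache updates spread across several i_pages locks instead of contending on
 * a single one.  swap_address_space() picks the right chunk from
 * swapper_spaces[] based on the entry's type and offset.
 */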
664 0 : int init_swap_address_space(unsigned int type, unsigned long nr_pages)
665 : {
666 : struct address_space *spaces, *space;
667 : unsigned int i, nr;
668 :
669 0 : nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
670 0 : spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL);
671 0 : if (!spaces)
672 : return -ENOMEM;
673 0 : for (i = 0; i < nr; i++) {
674 0 : space = spaces + i;
675 0 : xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ);
676 0 : atomic_set(&space->i_mmap_writable, 0);
677 0 : space->a_ops = &swap_aops;
678 : /* swap cache doesn't use writeback related tags */
679 0 : mapping_set_no_writeback_tags(space);
680 : }
681 0 : nr_swapper_spaces[type] = nr;
682 0 : swapper_spaces[type] = spaces;
683 :
684 0 : return 0;
685 : }
686 :
687 0 : void exit_swap_address_space(unsigned int type)
688 : {
689 : int i;
690 0 : struct address_space *spaces = swapper_spaces[type];
691 :
692 0 : for (i = 0; i < nr_swapper_spaces[type]; i++)
693 : VM_WARN_ON_ONCE(!mapping_empty(&spaces[i]));
694 0 : kvfree(spaces);
695 0 : nr_swapper_spaces[type] = 0;
696 0 : swapper_spaces[type] = NULL;
697 0 : }
698 :
699 : static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma,
700 : unsigned long faddr,
701 : unsigned long lpfn,
702 : unsigned long rpfn,
703 : unsigned long *start,
704 : unsigned long *end)
705 : {
706 0 : *start = max3(lpfn, PFN_DOWN(vma->vm_start),
707 : PFN_DOWN(faddr & PMD_MASK));
708 0 : *end = min3(rpfn, PFN_DOWN(vma->vm_end),
709 : PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
710 : }
711 :
712 0 : static void swap_ra_info(struct vm_fault *vmf,
713 : struct vma_swap_readahead *ra_info)
714 : {
715 0 : struct vm_area_struct *vma = vmf->vma;
716 : unsigned long ra_val;
717 : unsigned long faddr, pfn, fpfn;
718 : unsigned long start, end;
719 : pte_t *pte, *orig_pte;
720 : unsigned int max_win, hits, prev_win, win, left;
721 : #ifndef CONFIG_64BIT
722 : pte_t *tpte;
723 : #endif
724 :
725 0 : max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster),
726 : SWAP_RA_ORDER_CEILING);
727 0 : if (max_win == 1) {
728 0 : ra_info->win = 1;
729 0 : return;
730 : }
731 :
732 0 : faddr = vmf->address;
733 0 : orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
734 :
735 0 : fpfn = PFN_DOWN(faddr);
736 0 : ra_val = GET_SWAP_RA_VAL(vma);
737 0 : pfn = PFN_DOWN(SWAP_RA_ADDR(ra_val));
738 0 : prev_win = SWAP_RA_WIN(ra_val);
739 0 : hits = SWAP_RA_HITS(ra_val);
740 0 : ra_info->win = win = __swapin_nr_pages(pfn, fpfn, hits,
741 : max_win, prev_win);
742 0 : atomic_long_set(&vma->swap_readahead_info,
743 0 : SWAP_RA_VAL(faddr, win, 0));
744 :
745 0 : if (win == 1) {
746 : pte_unmap(orig_pte);
747 : return;
748 : }
749 :
750 : /* Copy the PTEs because the page table may be unmapped */
751 0 : if (fpfn == pfn + 1)
752 0 : swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end);
753 0 : else if (pfn == fpfn + 1)
754 0 : swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1,
755 : &start, &end);
756 : else {
757 0 : left = (win - 1) / 2;
758 0 : swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left,
759 : &start, &end);
760 : }
761 0 : ra_info->nr_pte = end - start;
762 0 : ra_info->offset = fpfn - start;
763 0 : pte -= ra_info->offset;
764 : #ifdef CONFIG_64BIT
765 0 : ra_info->ptes = pte;
766 : #else
767 : tpte = ra_info->ptes;
768 : for (pfn = start; pfn != end; pfn++)
769 : *tpte++ = *pte++;
770 : #endif
771 : pte_unmap(orig_pte);
772 : }
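/*
 * Illustration of the window placement above: if the faulting pfn directly
 * follows the previous faulting pfn (fpfn == pfn + 1) the window extends
 * forward from the fault; if it directly precedes it (pfn == fpfn + 1) the
 * window extends backward; otherwise the window is centered on the fault,
 * with (win - 1) / 2 pages to its left.
 */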
773 :
774 : /**
775 : * swap_vma_readahead - swap in pages in the hope we need them soon
776 : * @fentry: swap entry of this memory
777 : * @gfp_mask: memory allocation flags
778 : * @vmf: fault information
779 : *
780 : * Returns the struct page for entry and addr, after queueing swapin.
781 : *
782 : * Primitive swap readahead code. We simply read in a few pages whose
783 : * virtual addresses are around the fault address in the same vma.
784 : *
785 : * Caller must hold read mmap_lock if vmf->vma is not NULL.
786 : *
787 : */
788 0 : static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
789 : struct vm_fault *vmf)
790 : {
791 : struct blk_plug plug;
792 0 : struct vm_area_struct *vma = vmf->vma;
793 : struct page *page;
794 : pte_t *pte, pentry;
795 : swp_entry_t entry;
796 : unsigned int i;
797 : bool page_allocated;
798 0 : struct vma_swap_readahead ra_info = {
799 : .win = 1,
800 : };
801 :
802 0 : swap_ra_info(vmf, &ra_info);
803 0 : if (ra_info.win == 1)
804 : goto skip;
805 :
806 0 : blk_start_plug(&plug);
807 0 : for (i = 0, pte = ra_info.ptes; i < ra_info.nr_pte;
808 0 : i++, pte++) {
809 0 : pentry = *pte;
810 0 : if (pte_none(pentry))
811 0 : continue;
812 0 : if (pte_present(pentry))
813 0 : continue;
814 0 : entry = pte_to_swp_entry(pentry);
815 0 : if (unlikely(non_swap_entry(entry)))
816 0 : continue;
817 0 : page = __read_swap_cache_async(entry, gfp_mask, vma,
818 : vmf->address, &page_allocated);
819 0 : if (!page)
820 0 : continue;
821 0 : if (page_allocated) {
822 0 : swap_readpage(page, false);
823 0 : if (i != ra_info.offset) {
824 0 : SetPageReadahead(page);
825 0 : count_vm_event(SWAP_RA);
826 : }
827 : }
828 0 : put_page(page);
829 : }
830 0 : blk_finish_plug(&plug);
831 0 : lru_add_drain();
832 : skip:
833 0 : return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
834 0 : ra_info.win == 1);
835 : }
836 :
837 : /**
838 : * swapin_readahead - swap in pages in the hope we need them soon
839 : * @entry: swap entry of this memory
840 : * @gfp_mask: memory allocation flags
841 : * @vmf: fault information
842 : *
843 : * Returns the struct page for entry and addr, after queueing swapin.
844 : *
845 : * It's the main entry point for swap readahead. Depending on the
846 : * configuration, it reads ahead either by cluster (i.e. physical, disk
847 : * offset based) or by VMA (i.e. virtual addresses around the fault address).
848 : */
849 0 : struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
850 : struct vm_fault *vmf)
851 : {
852 : return swap_use_vma_readahead() ?
853 0 : swap_vma_readahead(entry, gfp_mask, vmf) :
854 : swap_cluster_readahead(entry, gfp_mask, vmf);
855 : }
856 :
857 : #ifdef CONFIG_SYSFS
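/*
 * VMA-based readahead can be toggled at runtime through sysfs.  With the
 * "swap" kobject created below under mm_kobj, the knob is expected to appear
 * as /sys/kernel/mm/swap/vma_ra_enabled, e.g.:
 *
 *	echo false > /sys/kernel/mm/swap/vma_ra_enabled
 */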
858 0 : static ssize_t vma_ra_enabled_show(struct kobject *kobj,
859 : struct kobj_attribute *attr, char *buf)
860 : {
861 0 : return sysfs_emit(buf, "%s\n",
862 0 : enable_vma_readahead ? "true" : "false");
863 : }
864 0 : static ssize_t vma_ra_enabled_store(struct kobject *kobj,
865 : struct kobj_attribute *attr,
866 : const char *buf, size_t count)
867 : {
868 0 : if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1))
869 0 : enable_vma_readahead = true;
870 0 : else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1))
871 0 : enable_vma_readahead = false;
872 : else
873 : return -EINVAL;
874 :
875 0 : return count;
876 : }
877 : static struct kobj_attribute vma_ra_enabled_attr =
878 : __ATTR(vma_ra_enabled, 0644, vma_ra_enabled_show,
879 : vma_ra_enabled_store);
880 :
881 : static struct attribute *swap_attrs[] = {
882 : &vma_ra_enabled_attr.attr,
883 : NULL,
884 : };
885 :
886 : static const struct attribute_group swap_attr_group = {
887 : .attrs = swap_attrs,
888 : };
889 :
890 1 : static int __init swap_init_sysfs(void)
891 : {
892 : int err;
893 : struct kobject *swap_kobj;
894 :
895 1 : swap_kobj = kobject_create_and_add("swap", mm_kobj);
896 1 : if (!swap_kobj) {
897 0 : pr_err("failed to create swap kobject\n");
898 0 : return -ENOMEM;
899 : }
900 1 : err = sysfs_create_group(swap_kobj, &swap_attr_group);
901 1 : if (err) {
902 0 : pr_err("failed to register swap group\n");
903 : goto delete_obj;
904 : }
905 : return 0;
906 :
907 : delete_obj:
908 0 : kobject_put(swap_kobj);
909 0 : return err;
910 : }
911 : subsys_initcall(swap_init_sysfs);
912 : #endif