LCOV - code coverage report
Current view: top level - mm - rmap.c (source / functions)
Test:         coverage.info
Date:         2022-12-09 01:23:36

                 Hit    Total    Coverage
Lines:             4      506      0.8 %
Functions:         1       39      2.6 %

          Line data    Source code
       1             : /*
       2             :  * mm/rmap.c - physical to virtual reverse mappings
       3             :  *
       4             :  * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
       5             :  * Released under the General Public License (GPL).
       6             :  *
       7             :  * Simple, low overhead reverse mapping scheme.
       8             :  * Please try to keep this thing as modular as possible.
       9             :  *
      10             :  * Provides methods for unmapping each kind of mapped page:
      11             :  * the anon methods track anonymous pages, and
      12             :  * the file methods track pages belonging to an inode.
      13             :  *
      14             :  * Original design by Rik van Riel <riel@conectiva.com.br> 2001
      15             :  * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
      16             :  * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
      17             :  * Contributions by Hugh Dickins 2003, 2004
      18             :  */
      19             : 
      20             : /*
      21             :  * Lock ordering in mm:
      22             :  *
      23             :  * inode->i_rwsem    (while writing or truncating, not reading or faulting)
      24             :  *   mm->mmap_lock
      25             :  *     mapping->invalidate_lock (in filemap_fault)
      26             :  *       page->flags PG_locked (lock_page)   * (see hugetlbfs below)
      27             :  *         hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
      28             :  *           mapping->i_mmap_rwsem
      29             :  *             hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
      30             :  *             anon_vma->rwsem
      31             :  *               mm->page_table_lock or pte_lock
      32             :  *                 swap_lock (in swap_duplicate, swap_info_get)
      33             :  *                   mmlist_lock (in mmput, drain_mmlist and others)
      34             :  *                   mapping->private_lock (in block_dirty_folio)
      35             :  *                     folio_lock_memcg move_lock (in block_dirty_folio)
      36             :  *                       i_pages lock (widely used)
      37             :  *                         lruvec->lru_lock (in folio_lruvec_lock_irq)
      38             :  *                   inode->i_lock (in set_page_dirty's __mark_inode_dirty)
      39             :  *                   bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
      40             :  *                     sb_lock (within inode_lock in fs/fs-writeback.c)
      41             :  *                     i_pages lock (widely used, in set_page_dirty,
      42             :  *                               in arch-dependent flush_dcache_mmap_lock,
      43             :  *                               within bdi.wb->list_lock in __sync_single_inode)
      44             :  *
      45             :  * anon_vma->rwsem,mapping->i_mmap_rwsem   (memory_failure, collect_procs_anon)
      46             :  *   ->tasklist_lock
      47             :  *     pte map lock
      48             :  *
      49             :  * * hugetlbfs PageHuge() pages take locks in this order:
      50             :  *         mapping->i_mmap_rwsem
      51             :  *           hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
      52             :  *             page->flags PG_locked (lock_page)
      53             :  */
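
The hierarchy above reads top-down: outer locks are taken before the inner ones listed beneath them. As a rough illustration only (this is not a code path from this file, and the function name is made up), the common anonymous-memory nesting of mmap_lock -> page lock -> anon_vma rwsem -> pte lock would look like this:

	/*
	 * Illustrative sketch of the documented nesting; assumes
	 * <linux/mm.h>, <linux/pagemap.h> and <linux/rmap.h>.
	 * Not a real code path in this file.
	 */
	static void lock_ordering_sketch(struct mm_struct *mm, struct folio *folio,
					 struct anon_vma *anon_vma, pmd_t *pmd,
					 unsigned long addr)
	{
		spinlock_t *ptl;
		pte_t *pte;

		mmap_read_lock(mm);			/* mm->mmap_lock */
		folio_lock(folio);			/* page->flags PG_locked */
		anon_vma_lock_read(anon_vma);		/* anon_vma->rwsem */
		pte = pte_offset_map_lock(mm, pmd, addr, &ptl);	/* pte_lock */

		/* ... work on the mapping ... */

		pte_unmap_unlock(pte, ptl);
		anon_vma_unlock_read(anon_vma);
		folio_unlock(folio);
		mmap_read_unlock(mm);
	}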
      54             : 
      55             : #include <linux/mm.h>
      56             : #include <linux/sched/mm.h>
      57             : #include <linux/sched/task.h>
      58             : #include <linux/pagemap.h>
      59             : #include <linux/swap.h>
      60             : #include <linux/swapops.h>
      61             : #include <linux/slab.h>
      62             : #include <linux/init.h>
      63             : #include <linux/ksm.h>
      64             : #include <linux/rmap.h>
      65             : #include <linux/rcupdate.h>
      66             : #include <linux/export.h>
      67             : #include <linux/memcontrol.h>
      68             : #include <linux/mmu_notifier.h>
      69             : #include <linux/migrate.h>
      70             : #include <linux/hugetlb.h>
      71             : #include <linux/huge_mm.h>
      72             : #include <linux/backing-dev.h>
      73             : #include <linux/page_idle.h>
      74             : #include <linux/memremap.h>
      75             : #include <linux/userfaultfd_k.h>
      76             : 
      77             : #include <asm/tlbflush.h>
      78             : 
      79             : #define CREATE_TRACE_POINTS
      80             : #include <trace/events/tlb.h>
      81             : #include <trace/events/migrate.h>
      82             : 
      83             : #include "internal.h"
      84             : 
      85             : static struct kmem_cache *anon_vma_cachep;
      86             : static struct kmem_cache *anon_vma_chain_cachep;
      87             : 
      88           0 : static inline struct anon_vma *anon_vma_alloc(void)
      89             : {
      90             :         struct anon_vma *anon_vma;
      91             : 
      92           0 :         anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
      93           0 :         if (anon_vma) {
      94           0 :                 atomic_set(&anon_vma->refcount, 1);
      95           0 :                 anon_vma->degree = 1;        /* Reference for first vma */
      96           0 :                 anon_vma->parent = anon_vma;
      97             :                 /*
      98             :                  * Initialise the anon_vma root to point to itself. If called
       99             :                  * from fork, the root will be reset to the parent's anon_vma.
     100             :                  */
     101           0 :                 anon_vma->root = anon_vma;
     102             :         }
     103             : 
     104           0 :         return anon_vma;
     105             : }
     106             : 
     107           0 : static inline void anon_vma_free(struct anon_vma *anon_vma)
     108             : {
     109             :         VM_BUG_ON(atomic_read(&anon_vma->refcount));
     110             : 
     111             :         /*
     112             :          * Synchronize against folio_lock_anon_vma_read() such that
     113             :          * we can safely hold the lock without the anon_vma getting
     114             :          * freed.
     115             :          *
     116             :          * Relies on the full mb implied by the atomic_dec_and_test() from
     117             :          * put_anon_vma() against the acquire barrier implied by
     118             :          * down_read_trylock() from folio_lock_anon_vma_read(). This orders:
     119             :          *
     120             :          * folio_lock_anon_vma_read()   VS      put_anon_vma()
     121             :          *   down_read_trylock()                  atomic_dec_and_test()
     122             :          *   LOCK                                 MB
     123             :          *   atomic_read()                        rwsem_is_locked()
     124             :          *
     125             :          * LOCK should suffice since the actual taking of the lock must
     126             :          * happen _before_ what follows.
     127             :          */
     128             :         might_sleep();
     129           0 :         if (rwsem_is_locked(&anon_vma->root->rwsem)) {
     130           0 :                 anon_vma_lock_write(anon_vma);
     131           0 :                 anon_vma_unlock_write(anon_vma);
     132             :         }
     133             : 
     134           0 :         kmem_cache_free(anon_vma_cachep, anon_vma);
     135           0 : }
     136             : 
     137             : static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
     138             : {
     139           0 :         return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
     140             : }
     141             : 
     142             : static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
     143             : {
     144           0 :         kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
     145             : }
     146             : 
     147             : static void anon_vma_chain_link(struct vm_area_struct *vma,
     148             :                                 struct anon_vma_chain *avc,
     149             :                                 struct anon_vma *anon_vma)
     150             : {
     151           0 :         avc->vma = vma;
     152           0 :         avc->anon_vma = anon_vma;
     153           0 :         list_add(&avc->same_vma, &vma->anon_vma_chain);
     154           0 :         anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
     155             : }
     156             : 
     157             : /**
     158             :  * __anon_vma_prepare - attach an anon_vma to a memory region
     159             :  * @vma: the memory region in question
     160             :  *
     161             :  * This makes sure the memory mapping described by 'vma' has
     162             :  * an 'anon_vma' attached to it, so that we can associate the
     163             :  * anonymous pages mapped into it with that anon_vma.
     164             :  *
     165             :  * The common case will be that we already have one, which
      166             :  * is handled inline by anon_vma_prepare(). But if not,
      167             :  * we either need to find an adjacent mapping whose
      168             :  * anon_vma we can reuse (very common when the only
      169             :  * reason for splitting a vma was mprotect()), or we
      170             :  * allocate a new one.
     171             :  *
     172             :  * Anon-vma allocations are very subtle, because we may have
     173             :  * optimistically looked up an anon_vma in folio_lock_anon_vma_read()
     174             :  * and that may actually touch the rwsem even in the newly
     175             :  * allocated vma (it depends on RCU to make sure that the
     176             :  * anon_vma isn't actually destroyed).
     177             :  *
     178             :  * As a result, we need to do proper anon_vma locking even
     179             :  * for the new allocation. At the same time, we do not want
     180             :  * to do any locking for the common case of already having
     181             :  * an anon_vma.
     182             :  *
     183             :  * This must be called with the mmap_lock held for reading.
     184             :  */
     185           0 : int __anon_vma_prepare(struct vm_area_struct *vma)
     186             : {
     187           0 :         struct mm_struct *mm = vma->vm_mm;
     188             :         struct anon_vma *anon_vma, *allocated;
     189             :         struct anon_vma_chain *avc;
     190             : 
     191             :         might_sleep();
     192             : 
     193           0 :         avc = anon_vma_chain_alloc(GFP_KERNEL);
     194           0 :         if (!avc)
     195             :                 goto out_enomem;
     196             : 
     197           0 :         anon_vma = find_mergeable_anon_vma(vma);
     198           0 :         allocated = NULL;
     199           0 :         if (!anon_vma) {
     200           0 :                 anon_vma = anon_vma_alloc();
     201           0 :                 if (unlikely(!anon_vma))
     202             :                         goto out_enomem_free_avc;
     203             :                 allocated = anon_vma;
     204             :         }
     205             : 
     206           0 :         anon_vma_lock_write(anon_vma);
     207             :         /* page_table_lock to protect against threads */
     208           0 :         spin_lock(&mm->page_table_lock);
     209           0 :         if (likely(!vma->anon_vma)) {
     210           0 :                 vma->anon_vma = anon_vma;
     211           0 :                 anon_vma_chain_link(vma, avc, anon_vma);
     212             :                 /* vma reference or self-parent link for new root */
     213           0 :                 anon_vma->degree++;
     214           0 :                 allocated = NULL;
     215           0 :                 avc = NULL;
     216             :         }
     217           0 :         spin_unlock(&mm->page_table_lock);
     218           0 :         anon_vma_unlock_write(anon_vma);
     219             : 
     220           0 :         if (unlikely(allocated))
     221             :                 put_anon_vma(allocated);
     222           0 :         if (unlikely(avc))
     223             :                 anon_vma_chain_free(avc);
     224             : 
     225             :         return 0;
     226             : 
     227             :  out_enomem_free_avc:
     228             :         anon_vma_chain_free(avc);
     229             :  out_enomem:
     230             :         return -ENOMEM;
     231             : }
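
For context, a hedged sketch of the expected caller pattern: fault paths go through the anon_vma_prepare() wrapper from <linux/rmap.h>, which only falls back to __anon_vma_prepare() above when vma->anon_vma is still NULL. The surrounding function below is hypothetical.

	/* Hypothetical fault-path excerpt; only anon_vma_prepare() is real. */
	static vm_fault_t sketch_anonymous_fault(struct vm_area_struct *vma)
	{
		/*
		 * Fast path: vma->anon_vma is already set and no lock is taken.
		 * Slow path: __anon_vma_prepare() allocates or reuses an anon_vma.
		 */
		if (unlikely(anon_vma_prepare(vma)))
			return VM_FAULT_OOM;

		/* ... allocate the page, then page_add_new_anon_rmap() ... */
		return 0;
	}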
     232             : 
     233             : /*
     234             :  * This is a useful helper function for locking the anon_vma root as
      235             :  * we traverse the vma->anon_vma_chain, looping over anon_vmas that
      236             :  * have the same vma.
      237             :  *
      238             :  * Such anon_vmas should have the same root, so you'd expect to see
      239             :  * just a single down_write of root->rwsem for the whole traversal.
     240             :  */
     241           0 : static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
     242             : {
     243           0 :         struct anon_vma *new_root = anon_vma->root;
     244           0 :         if (new_root != root) {
     245           0 :                 if (WARN_ON_ONCE(root))
     246           0 :                         up_write(&root->rwsem);
     247           0 :                 root = new_root;
     248           0 :                 down_write(&root->rwsem);
     249             :         }
     250           0 :         return root;
     251             : }
     252             : 
     253             : static inline void unlock_anon_vma_root(struct anon_vma *root)
     254             : {
     255           0 :         if (root)
     256           0 :                 up_write(&root->rwsem);
     257             : }
     258             : 
     259             : /*
     260             :  * Attach the anon_vmas from src to dst.
     261             :  * Returns 0 on success, -ENOMEM on failure.
     262             :  *
     263             :  * anon_vma_clone() is called by __vma_adjust(), __split_vma(), copy_vma() and
     264             :  * anon_vma_fork(). The first three want an exact copy of src, while the last
      265             :  * one, anon_vma_fork(), may try to reuse an existing anon_vma to prevent the
      266             :  * anon_vma hierarchy from growing endlessly. Since dst->anon_vma is set to NULL
      267             :  * before the call, we can identify this case by (!dst->anon_vma && src->anon_vma).
      268             :  *
      269             :  * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find
      270             :  * and reuse an existing anon_vma which has no vmas and only one child anon_vma.
      271             :  * This prevents the anon_vma hierarchy from degrading into an endless linear
      272             :  * chain in the case of a constantly forking task. On the other hand, an
      273             :  * anon_vma with more than one child isn't reused even if there is no live vma,
      274             :  * so an rmap walker has a good chance of avoiding a scan of the whole
      275             :  * hierarchy when it searches for where a page is mapped.
     276             :  */
     277           0 : int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
     278             : {
     279             :         struct anon_vma_chain *avc, *pavc;
     280           0 :         struct anon_vma *root = NULL;
     281             : 
     282           0 :         list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
     283             :                 struct anon_vma *anon_vma;
     284             : 
     285           0 :                 avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
     286           0 :                 if (unlikely(!avc)) {
     287           0 :                         unlock_anon_vma_root(root);
     288           0 :                         root = NULL;
     289           0 :                         avc = anon_vma_chain_alloc(GFP_KERNEL);
     290           0 :                         if (!avc)
     291             :                                 goto enomem_failure;
     292             :                 }
     293           0 :                 anon_vma = pavc->anon_vma;
     294           0 :                 root = lock_anon_vma_root(root, anon_vma);
     295           0 :                 anon_vma_chain_link(dst, avc, anon_vma);
     296             : 
     297             :                 /*
      298             :                  * Reuse the existing anon_vma if its degree is lower than two;
      299             :                  * that means it has no vma and only one anon_vma child.
      300             :                  *
      301             :                  * Do not choose the parent anon_vma, otherwise the first child
      302             :                  * will always reuse it. The root anon_vma is never reused:
      303             :                  * it has a self-parent reference and at least one child.
     304             :                  */
     305           0 :                 if (!dst->anon_vma && src->anon_vma &&
     306           0 :                     anon_vma != src->anon_vma && anon_vma->degree < 2)
     307           0 :                         dst->anon_vma = anon_vma;
     308             :         }
     309           0 :         if (dst->anon_vma)
     310           0 :                 dst->anon_vma->degree++;
     311             :         unlock_anon_vma_root(root);
     312             :         return 0;
     313             : 
     314             :  enomem_failure:
     315             :         /*
     316             :          * dst->anon_vma is dropped here otherwise its degree can be incorrectly
     317             :          * decremented in unlink_anon_vmas().
     318             :          * We can safely do this because callers of anon_vma_clone() don't care
     319             :          * about dst->anon_vma if anon_vma_clone() failed.
     320             :          */
     321           0 :         dst->anon_vma = NULL;
     322           0 :         unlink_anon_vmas(dst);
     323           0 :         return -ENOMEM;
     324             : }
     325             : 
     326             : /*
     327             :  * Attach vma to its own anon_vma, as well as to the anon_vmas that
     328             :  * the corresponding VMA in the parent process is attached to.
     329             :  * Returns 0 on success, non-zero on failure.
     330             :  */
     331           0 : int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
     332             : {
     333             :         struct anon_vma_chain *avc;
     334             :         struct anon_vma *anon_vma;
     335             :         int error;
     336             : 
     337             :         /* Don't bother if the parent process has no anon_vma here. */
     338           0 :         if (!pvma->anon_vma)
     339             :                 return 0;
     340             : 
     341             :         /* Drop inherited anon_vma, we'll reuse existing or allocate new. */
     342           0 :         vma->anon_vma = NULL;
     343             : 
     344             :         /*
     345             :          * First, attach the new VMA to the parent VMA's anon_vmas,
     346             :          * so rmap can find non-COWed pages in child processes.
     347             :          */
     348           0 :         error = anon_vma_clone(vma, pvma);
     349           0 :         if (error)
     350             :                 return error;
     351             : 
     352             :         /* An existing anon_vma has been reused, all done then. */
     353           0 :         if (vma->anon_vma)
     354             :                 return 0;
     355             : 
     356             :         /* Then add our own anon_vma. */
     357           0 :         anon_vma = anon_vma_alloc();
     358           0 :         if (!anon_vma)
     359             :                 goto out_error;
     360           0 :         avc = anon_vma_chain_alloc(GFP_KERNEL);
     361           0 :         if (!avc)
     362             :                 goto out_error_free_anon_vma;
     363             : 
     364             :         /*
     365             :          * The root anon_vma's rwsem is the lock actually used when we
     366             :          * lock any of the anon_vmas in this anon_vma tree.
     367             :          */
     368           0 :         anon_vma->root = pvma->anon_vma->root;
     369           0 :         anon_vma->parent = pvma->anon_vma;
     370             :         /*
     371             :          * With refcounts, an anon_vma can stay around longer than the
     372             :          * process it belongs to. The root anon_vma needs to be pinned until
     373             :          * this anon_vma is freed, because the lock lives in the root.
     374             :          */
     375           0 :         get_anon_vma(anon_vma->root);
     376             :         /* Mark this anon_vma as the one where our new (COWed) pages go. */
     377           0 :         vma->anon_vma = anon_vma;
     378           0 :         anon_vma_lock_write(anon_vma);
     379           0 :         anon_vma_chain_link(vma, avc, anon_vma);
     380           0 :         anon_vma->parent->degree++;
     381           0 :         anon_vma_unlock_write(anon_vma);
     382             : 
     383           0 :         return 0;
     384             : 
     385             :  out_error_free_anon_vma:
     386             :         put_anon_vma(anon_vma);
     387             :  out_error:
     388           0 :         unlink_anon_vmas(vma);
     389           0 :         return -ENOMEM;
     390             : }
     391             : 
     392           0 : void unlink_anon_vmas(struct vm_area_struct *vma)
     393             : {
     394             :         struct anon_vma_chain *avc, *next;
     395           0 :         struct anon_vma *root = NULL;
     396             : 
     397             :         /*
     398             :          * Unlink each anon_vma chained to the VMA.  This list is ordered
     399             :          * from newest to oldest, ensuring the root anon_vma gets freed last.
     400             :          */
     401           0 :         list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
     402           0 :                 struct anon_vma *anon_vma = avc->anon_vma;
     403             : 
     404           0 :                 root = lock_anon_vma_root(root, anon_vma);
     405           0 :                 anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
     406             : 
     407             :                 /*
     408             :                  * Leave empty anon_vmas on the list - we'll need
     409             :                  * to free them outside the lock.
     410             :                  */
     411           0 :                 if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) {
     412           0 :                         anon_vma->parent->degree--;
     413           0 :                         continue;
     414             :                 }
     415             : 
     416           0 :                 list_del(&avc->same_vma);
     417             :                 anon_vma_chain_free(avc);
     418             :         }
     419           0 :         if (vma->anon_vma) {
     420           0 :                 vma->anon_vma->degree--;
     421             : 
     422             :                 /*
      423             :                  * The vma is still needed after unlinking; a new anon_vma will be
      424             :                  * prepared when a fault is handled.
     425             :                  */
     426           0 :                 vma->anon_vma = NULL;
     427             :         }
     428           0 :         unlock_anon_vma_root(root);
     429             : 
     430             :         /*
      431             :          * Iterate the list once more; it now contains only empty and unlinked
      432             :          * anon_vmas, so destroy them. This could not be done earlier because
      433             :          * __put_anon_vma() needs to write-acquire anon_vma->root->rwsem.
     434             :          */
     435           0 :         list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
     436           0 :                 struct anon_vma *anon_vma = avc->anon_vma;
     437             : 
     438             :                 VM_WARN_ON(anon_vma->degree);
     439           0 :                 put_anon_vma(anon_vma);
     440             : 
     441           0 :                 list_del(&avc->same_vma);
     442           0 :                 anon_vma_chain_free(avc);
     443             :         }
     444           0 : }
     445             : 
     446           0 : static void anon_vma_ctor(void *data)
     447             : {
     448           0 :         struct anon_vma *anon_vma = data;
     449             : 
     450           0 :         init_rwsem(&anon_vma->rwsem);
     451           0 :         atomic_set(&anon_vma->refcount, 0);
     452           0 :         anon_vma->rb_root = RB_ROOT_CACHED;
     453           0 : }
     454             : 
     455           1 : void __init anon_vma_init(void)
     456             : {
     457           1 :         anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
     458             :                         0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT,
     459             :                         anon_vma_ctor);
     460           1 :         anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain,
     461             :                         SLAB_PANIC|SLAB_ACCOUNT);
     462           1 : }
     463             : 
     464             : /*
     465             :  * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
     466             :  *
      467             :  * Since there is no serialization whatsoever against page_remove_rmap(),
      468             :  * the best this function can do is return an anon_vma whose refcount has
      469             :  * been raised and which might have been relevant to this page.
     470             :  *
     471             :  * The page might have been remapped to a different anon_vma or the anon_vma
     472             :  * returned may already be freed (and even reused).
     473             :  *
     474             :  * In case it was remapped to a different anon_vma, the new anon_vma will be a
     475             :  * child of the old anon_vma, and the anon_vma lifetime rules will therefore
     476             :  * ensure that any anon_vma obtained from the page will still be valid for as
     477             :  * long as we observe page_mapped() [ hence all those page_mapped() tests ].
     478             :  *
     479             :  * All users of this function must be very careful when walking the anon_vma
     480             :  * chain and verify that the page in question is indeed mapped in it
     481             :  * [ something equivalent to page_mapped_in_vma() ].
     482             :  *
     483             :  * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from
     484             :  * page_remove_rmap() that the anon_vma pointer from page->mapping is valid
     485             :  * if there is a mapcount, we can dereference the anon_vma after observing
     486             :  * those.
     487             :  */
     488           0 : struct anon_vma *page_get_anon_vma(struct page *page)
     489             : {
     490           0 :         struct anon_vma *anon_vma = NULL;
     491             :         unsigned long anon_mapping;
     492             : 
     493             :         rcu_read_lock();
     494           0 :         anon_mapping = (unsigned long)READ_ONCE(page->mapping);
     495           0 :         if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
     496             :                 goto out;
     497           0 :         if (!page_mapped(page))
     498             :                 goto out;
     499             : 
     500           0 :         anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
     501           0 :         if (!atomic_inc_not_zero(&anon_vma->refcount)) {
     502             :                 anon_vma = NULL;
     503             :                 goto out;
     504             :         }
     505             : 
     506             :         /*
     507             :          * If this page is still mapped, then its anon_vma cannot have been
     508             :          * freed.  But if it has been unmapped, we have no security against the
     509             :          * anon_vma structure being freed and reused (for another anon_vma:
     510             :          * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero()
     511             :          * above cannot corrupt).
     512             :          */
     513           0 :         if (!page_mapped(page)) {
     514           0 :                 rcu_read_unlock();
     515             :                 put_anon_vma(anon_vma);
     516             :                 return NULL;
     517             :         }
     518             : out:
     519             :         rcu_read_unlock();
     520             : 
     521           0 :         return anon_vma;
     522             : }
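
A hedged usage sketch (modeled loosely on migration-style callers, not copied from this file): take the reference, treat everything read through it as possibly stale, and drop the reference with put_anon_vma(). The function name is made up.

	/* Illustrative only. */
	static void sketch_use_page_anon_vma(struct page *page)
	{
		struct anon_vma *anon_vma;

		anon_vma = page_get_anon_vma(page);	/* may return NULL */
		if (!anon_vma)
			return;

		/*
		 * The anon_vma is pinned, not locked: the page may have been
		 * unmapped or remapped meanwhile, so the walk must re-verify
		 * page_mapped() / page_mapped_in_vma() as it goes.
		 */
		anon_vma_lock_read(anon_vma);
		/* ... walk anon_vma->rb_root ... */
		anon_vma_unlock_read(anon_vma);

		put_anon_vma(anon_vma);
	}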
     523             : 
     524             : /*
     525             :  * Similar to page_get_anon_vma() except it locks the anon_vma.
     526             :  *
      527             :  * It's a little more complex as it tries to keep the fast path to a single
      528             :  * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
      529             :  * reference like with page_get_anon_vma() and then block on the rwsem.
     530             :  */
     531           0 : struct anon_vma *folio_lock_anon_vma_read(struct folio *folio)
     532             : {
     533           0 :         struct anon_vma *anon_vma = NULL;
     534             :         struct anon_vma *root_anon_vma;
     535             :         unsigned long anon_mapping;
     536             : 
     537             :         rcu_read_lock();
     538           0 :         anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
     539           0 :         if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
     540             :                 goto out;
     541           0 :         if (!folio_mapped(folio))
     542             :                 goto out;
     543             : 
     544           0 :         anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
     545           0 :         root_anon_vma = READ_ONCE(anon_vma->root);
     546           0 :         if (down_read_trylock(&root_anon_vma->rwsem)) {
     547             :                 /*
     548             :                  * If the folio is still mapped, then this anon_vma is still
      549             :                  * its anon_vma, and holding the rwsem ensures that it will
     550             :                  * not go away, see anon_vma_free().
     551             :                  */
     552           0 :                 if (!folio_mapped(folio)) {
     553           0 :                         up_read(&root_anon_vma->rwsem);
     554           0 :                         anon_vma = NULL;
     555             :                 }
     556             :                 goto out;
     557             :         }
     558             : 
      559             :         /* trylock failed, we have to sleep */
     560           0 :         if (!atomic_inc_not_zero(&anon_vma->refcount)) {
     561             :                 anon_vma = NULL;
     562             :                 goto out;
     563             :         }
     564             : 
     565           0 :         if (!folio_mapped(folio)) {
     566           0 :                 rcu_read_unlock();
     567             :                 put_anon_vma(anon_vma);
     568             :                 return NULL;
     569             :         }
     570             : 
      571             :         /* we pinned the anon_vma, it's safe to sleep */
     572             :         rcu_read_unlock();
     573           0 :         anon_vma_lock_read(anon_vma);
     574             : 
     575           0 :         if (atomic_dec_and_test(&anon_vma->refcount)) {
     576             :                 /*
     577             :                  * Oops, we held the last refcount, release the lock
     578             :                  * and bail -- can't simply use put_anon_vma() because
     579             :                  * we'll deadlock on the anon_vma_lock_write() recursion.
     580             :                  */
     581           0 :                 anon_vma_unlock_read(anon_vma);
     582           0 :                 __put_anon_vma(anon_vma);
     583           0 :                 anon_vma = NULL;
     584             :         }
     585             : 
     586             :         return anon_vma;
     587             : 
     588             : out:
     589             :         rcu_read_unlock();
     590           0 :         return anon_vma;
     591             : }
     592             : 
     593           0 : void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
     594             : {
     595           0 :         anon_vma_unlock_read(anon_vma);
     596           0 : }
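
A hedged sketch of the expected pairing: a NULL return means the folio is no longer anonymous or no longer mapped, and a successful lock is dropped with page_unlock_anon_vma_read() (which simply calls anon_vma_unlock_read()). The walk itself is only hinted at, and the function name is made up.

	/* Illustrative rmap-walk skeleton. */
	static void sketch_walk_anon_folio(struct folio *folio)
	{
		struct anon_vma *anon_vma;

		anon_vma = folio_lock_anon_vma_read(folio);
		if (!anon_vma)
			return;		/* no longer anon, or no longer mapped */

		/* ... anon_vma_interval_tree_foreach() over the mapping vmas ... */

		page_unlock_anon_vma_read(anon_vma);
	}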
     597             : 
     598             : #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
     599             : /*
      600             :  * Flush TLB entries for recently unmapped pages from remote CPUs. If a PTE
      601             :  * was dirty when it was unmapped, it is important that it be flushed before
      602             :  * any IO is initiated on the page, to prevent lost writes. Similarly, it
      603             :  * must be flushed before the page is freed, to prevent data leakage.
     604             :  */
     605             : void try_to_unmap_flush(void)
     606             : {
     607             :         struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
     608             : 
     609             :         if (!tlb_ubc->flush_required)
     610             :                 return;
     611             : 
     612             :         arch_tlbbatch_flush(&tlb_ubc->arch);
     613             :         tlb_ubc->flush_required = false;
     614             :         tlb_ubc->writable = false;
     615             : }
     616             : 
     617             : /* Flush iff there are potentially writable TLB entries that can race with IO */
     618             : void try_to_unmap_flush_dirty(void)
     619             : {
     620             :         struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
     621             : 
     622             :         if (tlb_ubc->writable)
     623             :                 try_to_unmap_flush();
     624             : }
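
A hedged sketch of the batching pattern these two helpers support, loosely modeled on reclaim (the function names below are hypothetical): unmap with TTU_BATCH_FLUSH so the flush is queued rather than issued per page, force the flush before IO on a possibly-dirty page, and flush once at the end of the batch.

	/* Illustrative only; pageout and the folio list handling are elided. */
	static void sketch_unmap_one(struct folio *folio)
	{
		/* PTEs are cleared now; the TLB flush is queued in current->tlb_ubc. */
		try_to_unmap(folio, TTU_BATCH_FLUSH);

		/* A dirty PTE must be flushed before IO is started on the page. */
		try_to_unmap_flush_dirty();
		/* ... pageout(folio) ... */
	}

	static void sketch_batch_done(void)
	{
		/* One flush for the whole batch instead of one per page. */
		try_to_unmap_flush();
	}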
     625             : 
     626             : /*
     627             :  * Bits 0-14 of mm->tlb_flush_batched record pending generations.
      628             :  * Bits 16-30 of mm->tlb_flush_batched record flushed generations.
     629             :  */
     630             : #define TLB_FLUSH_BATCH_FLUSHED_SHIFT   16
     631             : #define TLB_FLUSH_BATCH_PENDING_MASK                    \
     632             :         ((1 << (TLB_FLUSH_BATCH_FLUSHED_SHIFT - 1)) - 1)
     633             : #define TLB_FLUSH_BATCH_PENDING_LARGE                   \
     634             :         (TLB_FLUSH_BATCH_PENDING_MASK / 2)
     635             : 
     636             : static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
     637             : {
     638             :         struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
     639             :         int batch, nbatch;
     640             : 
     641             :         arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
     642             :         tlb_ubc->flush_required = true;
     643             : 
     644             :         /*
     645             :          * Ensure compiler does not re-order the setting of tlb_flush_batched
     646             :          * before the PTE is cleared.
     647             :          */
     648             :         barrier();
     649             :         batch = atomic_read(&mm->tlb_flush_batched);
     650             : retry:
     651             :         if ((batch & TLB_FLUSH_BATCH_PENDING_MASK) > TLB_FLUSH_BATCH_PENDING_LARGE) {
     652             :                 /*
     653             :                  * Prevent `pending' from catching up with `flushed' because of
     654             :                  * overflow.  Reset `pending' and `flushed' to be 1 and 0 if
     655             :                  * `pending' becomes large.
     656             :                  */
     657             :                 nbatch = atomic_cmpxchg(&mm->tlb_flush_batched, batch, 1);
     658             :                 if (nbatch != batch) {
     659             :                         batch = nbatch;
     660             :                         goto retry;
     661             :                 }
     662             :         } else {
     663             :                 atomic_inc(&mm->tlb_flush_batched);
     664             :         }
     665             : 
     666             :         /*
     667             :          * If the PTE was dirty then it's best to assume it's writable. The
     668             :          * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
     669             :          * before the page is queued for IO.
     670             :          */
     671             :         if (writable)
     672             :                 tlb_ubc->writable = true;
     673             : }
     674             : 
     675             : /*
     676             :  * Returns true if the TLB flush should be deferred to the end of a batch of
     677             :  * unmap operations to reduce IPIs.
     678             :  */
     679             : static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
     680             : {
     681             :         bool should_defer = false;
     682             : 
     683             :         if (!(flags & TTU_BATCH_FLUSH))
     684             :                 return false;
     685             : 
      686             :         /* If remote CPUs need to be flushed then defer the flush to the batch */
     687             :         if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
     688             :                 should_defer = true;
     689             :         put_cpu();
     690             : 
     691             :         return should_defer;
     692             : }
     693             : 
     694             : /*
      695             :  * Reclaim unmaps pages under the PTL but does not flush the TLB prior to
     696             :  * releasing the PTL if TLB flushes are batched. It's possible for a parallel
     697             :  * operation such as mprotect or munmap to race between reclaim unmapping
     698             :  * the page and flushing the page. If this race occurs, it potentially allows
     699             :  * access to data via a stale TLB entry. Tracking all mm's that have TLB
     700             :  * batching in flight would be expensive during reclaim so instead track
     701             :  * whether TLB batching occurred in the past and if so then do a flush here
     702             :  * if required. This will cost one additional flush per reclaim cycle paid
      703             :  * by the first operation at risk, such as mprotect or munmap.
     704             :  *
     705             :  * This must be called under the PTL so that an access to tlb_flush_batched
     706             :  * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
     707             :  * via the PTL.
     708             :  */
     709             : void flush_tlb_batched_pending(struct mm_struct *mm)
     710             : {
     711             :         int batch = atomic_read(&mm->tlb_flush_batched);
     712             :         int pending = batch & TLB_FLUSH_BATCH_PENDING_MASK;
     713             :         int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT;
     714             : 
     715             :         if (pending != flushed) {
     716             :                 flush_tlb_mm(mm);
     717             :                 /*
      718             :                  * If a new TLB flush became pending while we were flushing,
      719             :                  * leave mm->tlb_flush_batched as is, to avoid losing that flush.
     720             :                  */
     721             :                 atomic_cmpxchg(&mm->tlb_flush_batched, batch,
     722             :                                pending | (pending << TLB_FLUSH_BATCH_FLUSHED_SHIFT));
     723             :         }
     724             : }
     725             : #else
     726             : static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
     727             : {
     728             : }
     729             : 
     730             : static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
     731             : {
     732             :         return false;
     733             : }
     734             : #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
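
A hedged sketch of the "operation at risk" side that flush_tlb_batched_pending() protects: a munmap/mprotect-style path takes the PTE lock and calls flush_tlb_batched_pending() before it modifies or relies on the PTEs, so any reclaim-deferred flush for this mm is completed first. Everything except the two mm helpers is hypothetical, and flush_tlb_batched_pending() is assumed to come from mm/internal.h.

	/* Illustrative only; real callers are the pte zap/protection-change paths. */
	static void sketch_change_ptes(struct mm_struct *mm, pmd_t *pmd,
				       unsigned long addr)
	{
		spinlock_t *ptl;
		pte_t *pte;

		pte = pte_offset_map_lock(mm, pmd, addr, &ptl);

		/* Close the reclaim race: finish any deferred flush for this mm. */
		flush_tlb_batched_pending(mm);

		/* ... clear or change PTEs under the PTL ... */

		pte_unmap_unlock(pte, ptl);
	}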
     735             : 
     736             : /*
      737             :  * At what user virtual address is the page expected in the vma?
      738             :  * The caller should check that the page is actually part of the vma.
     739             :  */
     740           0 : unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
     741             : {
     742           0 :         struct folio *folio = page_folio(page);
     743           0 :         if (folio_test_anon(folio)) {
     744           0 :                 struct anon_vma *page__anon_vma = folio_anon_vma(folio);
     745             :                 /*
     746             :                  * Note: swapoff's unuse_vma() is more efficient with this
     747             :                  * check, and needs it to match anon_vma when KSM is active.
     748             :                  */
     749           0 :                 if (!vma->anon_vma || !page__anon_vma ||
     750           0 :                     vma->anon_vma->root != page__anon_vma->root)
     751             :                         return -EFAULT;
     752           0 :         } else if (!vma->vm_file) {
     753             :                 return -EFAULT;
     754           0 :         } else if (vma->vm_file->f_mapping != folio->mapping) {
     755             :                 return -EFAULT;
     756             :         }
     757             : 
     758           0 :         return vma_address(page, vma);
     759             : }
     760             : 
     761           0 : pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
     762             : {
     763             :         pgd_t *pgd;
     764             :         p4d_t *p4d;
     765             :         pud_t *pud;
     766           0 :         pmd_t *pmd = NULL;
     767             :         pmd_t pmde;
     768             : 
     769           0 :         pgd = pgd_offset(mm, address);
     770             :         if (!pgd_present(*pgd))
     771             :                 goto out;
     772             : 
     773           0 :         p4d = p4d_offset(pgd, address);
     774             :         if (!p4d_present(*p4d))
     775             :                 goto out;
     776             : 
     777           0 :         pud = pud_offset(p4d, address);
     778           0 :         if (!pud_present(*pud))
     779             :                 goto out;
     780             : 
     781           0 :         pmd = pmd_offset(pud, address);
     782             :         /*
     783             :          * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
     784             :          * without holding anon_vma lock for write.  So when looking for a
     785             :          * genuine pmde (in which to find pte), test present and !THP together.
     786             :          */
     787           0 :         pmde = *pmd;
     788           0 :         barrier();
     789           0 :         if (!pmd_present(pmde) || pmd_trans_huge(pmde))
     790             :                 pmd = NULL;
     791             : out:
     792           0 :         return pmd;
     793             : }
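
A hedged sketch of how callers typically use the result (the helper name is made up): a non-NULL pmd was present and not a transparent huge pmd at the time of the check, so it can be handed to pte_offset_map_lock().

	/* Illustrative only. */
	static pte_t *sketch_map_locked_pte(struct mm_struct *mm, unsigned long addr,
					    spinlock_t **ptlp)
	{
		pmd_t *pmd;

		pmd = mm_find_pmd(mm, addr);
		if (!pmd)
			return NULL;	/* not mapped, or pmd-mapped THP */

		return pte_offset_map_lock(mm, pmd, addr, ptlp);
	}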
     794             : 
     795             : struct folio_referenced_arg {
     796             :         int mapcount;
     797             :         int referenced;
     798             :         unsigned long vm_flags;
     799             :         struct mem_cgroup *memcg;
     800             : };
     801             : /*
      802             :  * arg: a struct folio_referenced_arg is passed in
     803             :  */
     804           0 : static bool folio_referenced_one(struct folio *folio,
     805             :                 struct vm_area_struct *vma, unsigned long address, void *arg)
     806             : {
     807           0 :         struct folio_referenced_arg *pra = arg;
     808           0 :         DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
     809           0 :         int referenced = 0;
     810             : 
     811           0 :         while (page_vma_mapped_walk(&pvmw)) {
     812           0 :                 address = pvmw.address;
     813             : 
     814           0 :                 if ((vma->vm_flags & VM_LOCKED) &&
     815           0 :                     (!folio_test_large(folio) || !pvmw.pte)) {
     816             :                         /* Restore the mlock which got missed */
     817           0 :                         mlock_vma_folio(folio, vma, !pvmw.pte);
     818           0 :                         page_vma_mapped_walk_done(&pvmw);
     819           0 :                         pra->vm_flags |= VM_LOCKED;
     820           0 :                         return false; /* To break the loop */
     821             :                 }
     822             : 
     823           0 :                 if (pvmw.pte) {
     824           0 :                         if (ptep_clear_flush_young_notify(vma, address,
     825             :                                                 pvmw.pte)) {
     826             :                                 /*
     827             :                                  * Don't treat a reference through
     828             :                                  * a sequentially read mapping as such.
     829             :                                  * If the folio has been used in another mapping,
     830             :                                  * we will catch it; if this other mapping is
     831             :                                  * already gone, the unmap path will have set
     832             :                                  * the referenced flag or activated the folio.
     833             :                                  */
     834           0 :                                 if (likely(!(vma->vm_flags & VM_SEQ_READ)))
     835           0 :                                         referenced++;
     836             :                         }
     837             :                 } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
     838             :                         if (pmdp_clear_flush_young_notify(vma, address,
     839             :                                                 pvmw.pmd))
     840             :                                 referenced++;
     841             :                 } else {
     842             :                         /* unexpected pmd-mapped folio? */
     843           0 :                         WARN_ON_ONCE(1);
     844             :                 }
     845             : 
     846           0 :                 pra->mapcount--;
     847             :         }
     848             : 
     849             :         if (referenced)
     850             :                 folio_clear_idle(folio);
     851           0 :         if (folio_test_clear_young(folio))
     852             :                 referenced++;
     853             : 
     854           0 :         if (referenced) {
     855           0 :                 pra->referenced++;
     856           0 :                 pra->vm_flags |= vma->vm_flags & ~VM_LOCKED;
     857             :         }
     858             : 
     859           0 :         if (!pra->mapcount)
     860             :                 return false; /* To break the loop */
     861             : 
     862           0 :         return true;
     863             : }
     864             : 
     865           0 : static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg)
     866             : {
     867           0 :         struct folio_referenced_arg *pra = arg;
     868           0 :         struct mem_cgroup *memcg = pra->memcg;
     869             : 
     870           0 :         if (!mm_match_cgroup(vma->vm_mm, memcg))
     871             :                 return true;
     872             : 
     873             :         return false;
     874             : }
     875             : 
     876             : /**
     877             :  * folio_referenced() - Test if the folio was referenced.
     878             :  * @folio: The folio to test.
     879             :  * @is_locked: Caller holds lock on the folio.
     880             :  * @memcg: target memory cgroup
     881             :  * @vm_flags: A combination of all the vma->vm_flags which referenced the folio.
     882             :  *
      883             :  * Quick test_and_clear_referenced for all mappings of a folio.
     884             :  *
     885             :  * Return: The number of mappings which referenced the folio.
     886             :  */
     887           0 : int folio_referenced(struct folio *folio, int is_locked,
     888             :                      struct mem_cgroup *memcg, unsigned long *vm_flags)
     889             : {
     890           0 :         int we_locked = 0;
     891           0 :         struct folio_referenced_arg pra = {
     892           0 :                 .mapcount = folio_mapcount(folio),
     893             :                 .memcg = memcg,
     894             :         };
     895           0 :         struct rmap_walk_control rwc = {
     896             :                 .rmap_one = folio_referenced_one,
     897             :                 .arg = (void *)&pra,
     898             :                 .anon_lock = folio_lock_anon_vma_read,
     899             :         };
     900             : 
     901           0 :         *vm_flags = 0;
     902           0 :         if (!pra.mapcount)
     903             :                 return 0;
     904             : 
     905           0 :         if (!folio_raw_mapping(folio))
     906             :                 return 0;
     907             : 
     908           0 :         if (!is_locked && (!folio_test_anon(folio) || folio_test_ksm(folio))) {
     909           0 :                 we_locked = folio_trylock(folio);
     910           0 :                 if (!we_locked)
     911             :                         return 1;
     912             :         }
     913             : 
     914             :         /*
      915             :          * If we are reclaiming on behalf of a cgroup, skip
      916             :          * counting references that come from mms belonging
      917             :          * to other cgroups.
     918             :          */
     919           0 :         if (memcg) {
     920           0 :                 rwc.invalid_vma = invalid_folio_referenced_vma;
     921             :         }
     922             : 
     923           0 :         rmap_walk(folio, &rwc);
     924           0 :         *vm_flags = pra.vm_flags;
     925             : 
     926           0 :         if (we_locked)
     927           0 :                 folio_unlock(folio);
     928             : 
     929           0 :         return pra.referenced;
     930             : }
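
A hedged sketch of how a reclaim-style caller might consume the result; the decision policy shown is illustrative and deliberately simpler than the real folio_check_references() logic in mm/vmscan.c.

	/* Illustrative policy only. */
	static bool sketch_folio_was_referenced(struct folio *folio,
						struct mem_cgroup *memcg)
	{
		unsigned long vm_flags;
		int referenced;

		/* The caller is assumed to hold the folio lock (is_locked == 1). */
		referenced = folio_referenced(folio, 1, memcg, &vm_flags);

		if (vm_flags & VM_LOCKED)
			return true;	/* mapped into an mlocked vma: keep it */

		return referenced > 0;
	}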
     931             : 
     932           0 : static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma,
     933             :                             unsigned long address, void *arg)
     934             : {
     935           0 :         DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC);
     936             :         struct mmu_notifier_range range;
     937           0 :         int *cleaned = arg;
     938             : 
     939             :         /*
      940             :          * We have to assume the worst case, i.e. pmd, for invalidation. Note that
      941             :          * the folio cannot be freed from this function.
     942             :          */
     943             :         mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
     944             :                                 0, vma, vma->vm_mm, address,
     945             :                                 vma_address_end(&pvmw));
     946             :         mmu_notifier_invalidate_range_start(&range);
     947             : 
     948           0 :         while (page_vma_mapped_walk(&pvmw)) {
     949           0 :                 int ret = 0;
     950             : 
     951           0 :                 address = pvmw.address;
     952           0 :                 if (pvmw.pte) {
     953             :                         pte_t entry;
     954           0 :                         pte_t *pte = pvmw.pte;
     955             : 
     956           0 :                         if (!pte_dirty(*pte) && !pte_write(*pte))
     957           0 :                                 continue;
     958             : 
     959           0 :                         flush_cache_page(vma, address, pte_pfn(*pte));
     960           0 :                         entry = ptep_clear_flush(vma, address, pte);
     961           0 :                         entry = pte_wrprotect(entry);
     962           0 :                         entry = pte_mkclean(entry);
     963           0 :                         set_pte_at(vma->vm_mm, address, pte, entry);
     964             :                         ret = 1;
     965             :                 } else {
     966             : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
     967             :                         pmd_t *pmd = pvmw.pmd;
     968             :                         pmd_t entry;
     969             : 
     970             :                         if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
     971             :                                 continue;
     972             : 
     973             :                         flush_cache_page(vma, address, folio_pfn(folio));
     974             :                         entry = pmdp_invalidate(vma, address, pmd);
     975             :                         entry = pmd_wrprotect(entry);
     976             :                         entry = pmd_mkclean(entry);
     977             :                         set_pmd_at(vma->vm_mm, address, pmd, entry);
     978             :                         ret = 1;
     979             : #else
     980             :                         /* unexpected pmd-mapped folio? */
     981           0 :                         WARN_ON_ONCE(1);
     982             : #endif
     983             :                 }
     984             : 
     985             :                 /*
     986             :                  * No need to call mmu_notifier_invalidate_range() as we are
      987             :          * downgrading page table protection, not changing it to point
     988             :                  * to a new page.
     989             :                  *
     990             :                  * See Documentation/vm/mmu_notifier.rst
     991             :                  */
     992           0 :                 if (ret)
     993           0 :                         (*cleaned)++;
     994             :         }
     995             : 
     996           0 :         mmu_notifier_invalidate_range_end(&range);
     997             : 
     998           0 :         return true;
     999             : }
    1000             : 
    1001           0 : static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
    1002             : {
    1003           0 :         if (vma->vm_flags & VM_SHARED)
    1004             :                 return false;
    1005             : 
    1006           0 :         return true;
    1007             : }
    1008             : 
    1009           0 : int folio_mkclean(struct folio *folio)
    1010             : {
    1011           0 :         int cleaned = 0;
    1012             :         struct address_space *mapping;
    1013           0 :         struct rmap_walk_control rwc = {
    1014             :                 .arg = (void *)&cleaned,
    1015             :                 .rmap_one = page_mkclean_one,
    1016             :                 .invalid_vma = invalid_mkclean_vma,
    1017             :         };
    1018             : 
    1019           0 :         BUG_ON(!folio_test_locked(folio));
    1020             : 
    1021           0 :         if (!folio_mapped(folio))
    1022             :                 return 0;
    1023             : 
    1024           0 :         mapping = folio_mapping(folio);
    1025           0 :         if (!mapping)
    1026             :                 return 0;
    1027             : 
    1028           0 :         rmap_walk(folio, &rwc);
    1029             : 
    1030           0 :         return cleaned;
    1031             : }
    1032             : EXPORT_SYMBOL_GPL(folio_mkclean);
    1033             : 
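/*
 * Editor's illustrative sketch, not part of rmap.c: a hypothetical helper
 * (example_clean_before_writeback) showing the calling convention of
 * folio_mkclean() above - the folio must be locked, and the return value
 * is the number of PTEs that were write-protected and cleaned.  Real
 * callers such as folio_clear_dirty_for_io() follow the same shape;
 * dirty-accounting details are omitted here.
 */
static bool example_clean_before_writeback(struct folio *folio)
{
        int cleaned;

        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

        /* Write-protect and clean every PTE currently mapping the folio. */
        cleaned = folio_mkclean(folio);

        /* Any cleaned PTE means user data may be newer than what is on disk. */
        return cleaned > 0 || folio_test_dirty(folio);
}
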
    1034             : /**
    1035             :  * page_move_anon_rmap - move a page to our anon_vma
    1036             :  * @page:       the page to move to our anon_vma
    1037             :  * @vma:        the vma the page belongs to
    1038             :  *
    1039             :  * When a page belongs exclusively to one process after a COW event,
    1040             :  * that page can be moved into the anon_vma that belongs to just that
    1041             :  * process, so the rmap code will not search the parent or sibling
    1042             :  * processes.
    1043             :  */
    1044           0 : void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
    1045             : {
    1046           0 :         struct anon_vma *anon_vma = vma->anon_vma;
    1047             : 
    1048           0 :         page = compound_head(page);
    1049             : 
    1050             :         VM_BUG_ON_PAGE(!PageLocked(page), page);
    1051             :         VM_BUG_ON_VMA(!anon_vma, vma);
    1052             : 
    1053           0 :         anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
    1054             :         /*
    1055             :          * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written
    1056             :          * simultaneously, so a concurrent reader (eg folio_referenced()'s
    1057             :          * folio_test_anon()) will not see one without the other.
    1058             :          */
    1059           0 :         WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
    1060           0 : }
    1061             : 
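/*
 * Editor's illustrative sketch, not part of rmap.c: how the anon_vma
 * pointer written above can be recovered from page->mapping.  The low
 * bits of the mapping field act as a type tag; PAGE_MAPPING_ANON marks
 * an anonymous page and the remaining bits hold the anon_vma pointer.
 * Helpers such as folio_anon_vma() do the equivalent of this.
 */
static struct anon_vma *example_decode_anon_mapping(struct page *page)
{
        unsigned long mapping = (unsigned long)READ_ONCE(page->mapping);

        if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
                return NULL;    /* file-backed, KSM, or not mapped */

        return (struct anon_vma *)(mapping - PAGE_MAPPING_ANON);
}
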
    1062             : /**
    1063             :  * __page_set_anon_rmap - set up new anonymous rmap
    1064             :  * @page:       Page or Hugepage to add to rmap
    1065             :  * @vma:        VM area to add page to.
     1066             :  * @address:    User virtual address of the mapping
    1067             :  * @exclusive:  the page is exclusively owned by the current process
    1068             :  */
    1069           0 : static void __page_set_anon_rmap(struct page *page,
    1070             :         struct vm_area_struct *vma, unsigned long address, int exclusive)
    1071             : {
    1072           0 :         struct anon_vma *anon_vma = vma->anon_vma;
    1073             : 
    1074           0 :         BUG_ON(!anon_vma);
    1075             : 
    1076           0 :         if (PageAnon(page))
    1077             :                 return;
    1078             : 
    1079             :         /*
    1080             :          * If the page isn't exclusively mapped into this vma,
    1081             :          * we must use the _oldest_ possible anon_vma for the
    1082             :          * page mapping!
    1083             :          */
    1084           0 :         if (!exclusive)
    1085           0 :                 anon_vma = anon_vma->root;
    1086             : 
    1087             :         /*
    1088             :          * page_idle does a lockless/optimistic rmap scan on page->mapping.
    1089             :          * Make sure the compiler doesn't split the stores of anon_vma and
    1090             :          * the PAGE_MAPPING_ANON type identifier, otherwise the rmap code
    1091             :          * could mistake the mapping for a struct address_space and crash.
    1092             :          */
    1093           0 :         anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
    1094           0 :         WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
    1095           0 :         page->index = linear_page_index(vma, address);
    1096             : }
    1097             : 
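/*
 * Editor's illustrative sketch, not part of rmap.c: the index that
 * linear_page_index() stores above is simply the page's offset within
 * the VMA plus the VMA's pgoff.  This simplified version ignores the
 * hugetlb case, which scales by the huge page size instead of PAGE_SIZE.
 */
static pgoff_t example_linear_index(struct vm_area_struct *vma,
                                    unsigned long address)
{
        return ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
}
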
    1098             : /**
    1099             :  * __page_check_anon_rmap - sanity check anonymous rmap addition
    1100             :  * @page:       the page to add the mapping to
    1101             :  * @vma:        the vm area in which the mapping is added
    1102             :  * @address:    the user virtual address mapped
    1103             :  */
    1104             : static void __page_check_anon_rmap(struct page *page,
    1105             :         struct vm_area_struct *vma, unsigned long address)
    1106             : {
    1107           0 :         struct folio *folio = page_folio(page);
    1108             :         /*
    1109             :          * The page's anon-rmap details (mapping and index) are guaranteed to
    1110             :          * be set up correctly at this point.
    1111             :          *
    1112             :          * We have exclusion against page_add_anon_rmap because the caller
    1113             :          * always holds the page locked.
    1114             :          *
    1115             :          * We have exclusion against page_add_new_anon_rmap because those pages
    1116             :          * are initially only visible via the pagetables, and the pte is locked
    1117             :          * over the call to page_add_new_anon_rmap.
    1118             :          */
    1119             :         VM_BUG_ON_FOLIO(folio_anon_vma(folio)->root != vma->anon_vma->root,
    1120             :                         folio);
    1121             :         VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address),
    1122             :                        page);
    1123             : }
    1124             : 
    1125             : /**
    1126             :  * page_add_anon_rmap - add pte mapping to an anonymous page
    1127             :  * @page:       the page to add the mapping to
    1128             :  * @vma:        the vm area in which the mapping is added
    1129             :  * @address:    the user virtual address mapped
    1130             :  * @compound:   charge the page as compound or small page
    1131             :  *
    1132             :  * The caller needs to hold the pte lock, and the page must be locked in
     1133             :  * the anon_vma case: to serialize mapping, index checking after setting,
    1134             :  * and to ensure that PageAnon is not being upgraded racily to PageKsm
    1135             :  * (but PageKsm is never downgraded to PageAnon).
    1136             :  */
    1137           0 : void page_add_anon_rmap(struct page *page,
    1138             :         struct vm_area_struct *vma, unsigned long address, bool compound)
    1139             : {
    1140           0 :         do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0);
    1141           0 : }
    1142             : 
    1143             : /*
    1144             :  * Special version of the above for do_swap_page, which often runs
    1145             :  * into pages that are exclusively owned by the current process.
    1146             :  * Everybody else should continue to use page_add_anon_rmap above.
    1147             :  */
    1148           0 : void do_page_add_anon_rmap(struct page *page,
    1149             :         struct vm_area_struct *vma, unsigned long address, int flags)
    1150             : {
    1151           0 :         bool compound = flags & RMAP_COMPOUND;
    1152             :         bool first;
    1153             : 
    1154           0 :         if (unlikely(PageKsm(page)))
    1155             :                 lock_page_memcg(page);
    1156             :         else
    1157             :                 VM_BUG_ON_PAGE(!PageLocked(page), page);
    1158             : 
    1159           0 :         if (compound) {
    1160             :                 atomic_t *mapcount;
    1161             :                 VM_BUG_ON_PAGE(!PageLocked(page), page);
    1162             :                 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
    1163           0 :                 mapcount = compound_mapcount_ptr(page);
    1164           0 :                 first = atomic_inc_and_test(mapcount);
    1165             :         } else {
    1166           0 :                 first = atomic_inc_and_test(&page->_mapcount);
    1167             :         }
    1168             : 
    1169           0 :         if (first) {
    1170           0 :                 int nr = compound ? thp_nr_pages(page) : 1;
    1171             :                 /*
    1172             :                  * We use the irq-unsafe __{inc|mod}_zone_page_stat because
    1173             :                  * these counters are not modified in interrupt context, and
     1174             :                  * pte lock (a spinlock) is held, which implies preemption
    1175             :                  * disabled.
    1176             :                  */
    1177           0 :                 if (compound)
    1178           0 :                         __mod_lruvec_page_state(page, NR_ANON_THPS, nr);
    1179           0 :                 __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
    1180             :         }
    1181             : 
    1182           0 :         if (unlikely(PageKsm(page)))
    1183             :                 unlock_page_memcg(page);
    1184             : 
    1185             :         /* address might be in next vma when migration races vma_adjust */
    1186           0 :         else if (first)
    1187           0 :                 __page_set_anon_rmap(page, vma, address,
    1188             :                                 flags & RMAP_EXCLUSIVE);
    1189             :         else
    1190           0 :                 __page_check_anon_rmap(page, vma, address);
    1191             : 
    1192           0 :         mlock_vma_page(page, vma, compound);
    1193           0 : }
    1194             : 
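/*
 * Editor's illustrative sketch, not part of rmap.c: why "first" above
 * detects the first mapping.  _mapcount is kept biased by -1, so
 * atomic_inc_and_test() returns true only on the -1 -> 0 transition,
 * i.e. exactly when the page goes from unmapped to mapped once.
 */
static bool example_first_mapping(struct page *page)
{
        /* -1: unmapped, 0: mapped once, 1: mapped twice, ... */
        return atomic_inc_and_test(&page->_mapcount);
}
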
    1195             : /**
    1196             :  * page_add_new_anon_rmap - add pte mapping to a new anonymous page
    1197             :  * @page:       the page to add the mapping to
    1198             :  * @vma:        the vm area in which the mapping is added
    1199             :  * @address:    the user virtual address mapped
    1200             :  * @compound:   charge the page as compound or small page
    1201             :  *
    1202             :  * Same as page_add_anon_rmap but must only be called on *new* pages.
    1203             :  * This means the inc-and-test can be bypassed.
    1204             :  * Page does not have to be locked.
    1205             :  */
    1206           0 : void page_add_new_anon_rmap(struct page *page,
    1207             :         struct vm_area_struct *vma, unsigned long address, bool compound)
    1208             : {
    1209           0 :         int nr = compound ? thp_nr_pages(page) : 1;
    1210             : 
    1211             :         VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
    1212           0 :         __SetPageSwapBacked(page);
    1213           0 :         if (compound) {
    1214             :                 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
    1215             :                 /* increment count (starts at -1) */
    1216           0 :                 atomic_set(compound_mapcount_ptr(page), 0);
    1217           0 :                 atomic_set(compound_pincount_ptr(page), 0);
    1218             : 
    1219           0 :                 __mod_lruvec_page_state(page, NR_ANON_THPS, nr);
    1220             :         } else {
    1221             :                 /* Anon THP always mapped first with PMD */
    1222             :                 VM_BUG_ON_PAGE(PageTransCompound(page), page);
    1223             :                 /* increment count (starts at -1) */
    1224           0 :                 atomic_set(&page->_mapcount, 0);
    1225             :         }
    1226           0 :         __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
    1227           0 :         __page_set_anon_rmap(page, vma, address, 1);
    1228           0 : }
    1229             : 
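/*
 * Editor's illustrative sketch, not part of rmap.c: a simplified
 * anonymous-fault shape (example_map_new_anon_page) showing the expected
 * calling convention for page_add_new_anon_rmap() - a brand new page,
 * pte lock already held by the caller, rmap and LRU set up before the
 * pte is installed.  Memcg charging, write-permission checks, error
 * handling and userfaultfd are omitted.
 */
static void example_map_new_anon_page(struct vm_area_struct *vma,
                                      struct page *page, pte_t *pte,
                                      unsigned long address)
{
        pte_t entry = mk_pte(page, vma->vm_page_prot);

        entry = pte_mkwrite(pte_mkdirty(entry));
        page_add_new_anon_rmap(page, vma, address, false);
        lru_cache_add_inactive_or_unevictable(page, vma);
        set_pte_at(vma->vm_mm, address, pte, entry);
}
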
    1230             : /**
    1231             :  * page_add_file_rmap - add pte mapping to a file page
    1232             :  * @page:       the page to add the mapping to
    1233             :  * @vma:        the vm area in which the mapping is added
    1234             :  * @compound:   charge the page as compound or small page
    1235             :  *
    1236             :  * The caller needs to hold the pte lock.
    1237             :  */
    1238           0 : void page_add_file_rmap(struct page *page,
    1239             :         struct vm_area_struct *vma, bool compound)
    1240             : {
    1241           0 :         int i, nr = 0;
    1242             : 
    1243             :         VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
    1244           0 :         lock_page_memcg(page);
    1245             :         if (compound && PageTransHuge(page)) {
    1246             :                 int nr_pages = thp_nr_pages(page);
    1247             : 
    1248             :                 for (i = 0; i < nr_pages; i++) {
    1249             :                         if (atomic_inc_and_test(&page[i]._mapcount))
    1250             :                                 nr++;
    1251             :                 }
    1252             :                 if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
    1253             :                         goto out;
    1254             : 
    1255             :                 /*
    1256             :                  * It is racy to ClearPageDoubleMap in page_remove_file_rmap();
    1257             :                  * but page lock is held by all page_add_file_rmap() compound
    1258             :                  * callers, and SetPageDoubleMap below warns if !PageLocked:
    1259             :                  * so here is a place that DoubleMap can be safely cleared.
    1260             :                  */
    1261             :                 VM_WARN_ON_ONCE(!PageLocked(page));
    1262             :                 if (nr == nr_pages && PageDoubleMap(page))
    1263             :                         ClearPageDoubleMap(page);
    1264             : 
    1265             :                 if (PageSwapBacked(page))
    1266             :                         __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED,
    1267             :                                                 nr_pages);
    1268             :                 else
    1269             :                         __mod_lruvec_page_state(page, NR_FILE_PMDMAPPED,
    1270             :                                                 nr_pages);
    1271             :         } else {
    1272           0 :                 if (PageTransCompound(page) && page_mapping(page)) {
    1273             :                         VM_WARN_ON_ONCE(!PageLocked(page));
    1274             :                         SetPageDoubleMap(compound_head(page));
    1275             :                 }
    1276           0 :                 if (atomic_inc_and_test(&page->_mapcount))
    1277           0 :                         nr++;
    1278             :         }
    1279             : out:
    1280           0 :         if (nr)
    1281           0 :                 __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr);
    1282           0 :         unlock_page_memcg(page);
    1283             : 
    1284           0 :         mlock_vma_page(page, vma, compound);
    1285           0 : }
    1286             : 
    1287             : static void page_remove_file_rmap(struct page *page, bool compound)
    1288             : {
    1289           0 :         int i, nr = 0;
    1290             : 
    1291             :         VM_BUG_ON_PAGE(compound && !PageHead(page), page);
    1292             : 
    1293             :         /* Hugepages are not counted in NR_FILE_MAPPED for now. */
    1294           0 :         if (unlikely(PageHuge(page))) {
    1295             :                 /* hugetlb pages are always mapped with pmds */
    1296             :                 atomic_dec(compound_mapcount_ptr(page));
    1297             :                 return;
    1298             :         }
    1299             : 
    1300             :         /* page still mapped by someone else? */
    1301             :         if (compound && PageTransHuge(page)) {
    1302             :                 int nr_pages = thp_nr_pages(page);
    1303             : 
    1304             :                 for (i = 0; i < nr_pages; i++) {
    1305             :                         if (atomic_add_negative(-1, &page[i]._mapcount))
    1306             :                                 nr++;
    1307             :                 }
    1308             :                 if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
    1309             :                         goto out;
    1310             :                 if (PageSwapBacked(page))
    1311             :                         __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED,
    1312             :                                                 -nr_pages);
    1313             :                 else
    1314             :                         __mod_lruvec_page_state(page, NR_FILE_PMDMAPPED,
    1315             :                                                 -nr_pages);
    1316             :         } else {
    1317           0 :                 if (atomic_add_negative(-1, &page->_mapcount))
    1318           0 :                         nr++;
    1319             :         }
    1320             : out:
    1321           0 :         if (nr)
    1322           0 :                 __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr);
    1323             : }
    1324             : 
    1325             : static void page_remove_anon_compound_rmap(struct page *page)
    1326             : {
    1327             :         int i, nr;
    1328             : 
    1329           0 :         if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
    1330             :                 return;
    1331             : 
    1332             :         /* Hugepages are not counted in NR_ANON_PAGES for now. */
    1333             :         if (unlikely(PageHuge(page)))
    1334             :                 return;
    1335             : 
    1336             :         if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
    1337             :                 return;
    1338             : 
    1339             :         __mod_lruvec_page_state(page, NR_ANON_THPS, -thp_nr_pages(page));
    1340             : 
    1341             :         if (TestClearPageDoubleMap(page)) {
    1342             :                 /*
    1343             :                  * Subpages can be mapped with PTEs too. Check how many of
    1344             :                  * them are still mapped.
    1345             :                  */
    1346             :                 for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
    1347             :                         if (atomic_add_negative(-1, &page[i]._mapcount))
    1348             :                                 nr++;
    1349             :                 }
    1350             : 
    1351             :                 /*
    1352             :                  * Queue the page for deferred split if at least one small
    1353             :                  * page of the compound page is unmapped, but at least one
    1354             :                  * small page is still mapped.
    1355             :                  */
    1356             :                 if (nr && nr < thp_nr_pages(page))
    1357             :                         deferred_split_huge_page(page);
    1358             :         } else {
    1359             :                 nr = thp_nr_pages(page);
    1360             :         }
    1361             : 
    1362             :         if (nr)
    1363             :                 __mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr);
    1364             : }
    1365             : 
    1366             : /**
    1367             :  * page_remove_rmap - take down pte mapping from a page
    1368             :  * @page:       page to remove mapping from
    1369             :  * @vma:        the vm area from which the mapping is removed
    1370             :  * @compound:   uncharge the page as compound or small page
    1371             :  *
    1372             :  * The caller needs to hold the pte lock.
    1373             :  */
    1374           0 : void page_remove_rmap(struct page *page,
    1375             :         struct vm_area_struct *vma, bool compound)
    1376             : {
    1377           0 :         lock_page_memcg(page);
    1378             : 
    1379           0 :         if (!PageAnon(page)) {
    1380           0 :                 page_remove_file_rmap(page, compound);
    1381             :                 goto out;
    1382             :         }
    1383             : 
    1384           0 :         if (compound) {
    1385             :                 page_remove_anon_compound_rmap(page);
    1386             :                 goto out;
    1387             :         }
    1388             : 
    1389             :         /* page still mapped by someone else? */
    1390           0 :         if (!atomic_add_negative(-1, &page->_mapcount))
    1391             :                 goto out;
    1392             : 
    1393             :         /*
    1394             :          * We use the irq-unsafe __{inc|mod}_zone_page_stat because
    1395             :          * these counters are not modified in interrupt context, and
     1396             :          * pte lock (a spinlock) is held, which implies preemption disabled.
    1397             :          */
    1398           0 :         __dec_lruvec_page_state(page, NR_ANON_MAPPED);
    1399             : 
    1400           0 :         if (PageTransCompound(page))
    1401             :                 deferred_split_huge_page(compound_head(page));
    1402             : 
    1403             :         /*
    1404             :          * It would be tidy to reset the PageAnon mapping here,
    1405             :          * but that might overwrite a racing page_add_anon_rmap
    1406             :          * which increments mapcount after us but sets mapping
    1407             :          * before us: so leave the reset to free_unref_page,
    1408             :          * and remember that it's only reliable while mapped.
    1409             :          * Leaving it set also helps swapoff to reinstate ptes
    1410             :          * faster for those pages still in swapcache.
    1411             :          */
    1412             : out:
    1413           0 :         unlock_page_memcg(page);
    1414             : 
    1415           0 :         munlock_vma_page(page, vma, compound);
    1416           0 : }
    1417             : 
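/*
 * Editor's illustrative sketch, not part of rmap.c: the typical teardown
 * pairing that try_to_unmap_one() below races against - a zap-style path
 * clears the pte under the pte lock and only then drops the rmap with
 * page_remove_rmap().  TLB batching and rss accounting are omitted.
 */
static void example_zap_one_pte(struct vm_area_struct *vma, pte_t *pte,
                                unsigned long address, struct page *page)
{
        pte_t pteval = ptep_get_and_clear_full(vma->vm_mm, address, pte, 1);

        if (pte_dirty(pteval))
                set_page_dirty(page);
        page_remove_rmap(page, vma, false);
}
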
    1418             : /*
    1419             :  * @arg: enum ttu_flags will be passed to this argument
    1420             :  */
    1421           0 : static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
    1422             :                      unsigned long address, void *arg)
    1423             : {
    1424           0 :         struct mm_struct *mm = vma->vm_mm;
    1425           0 :         DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
    1426             :         pte_t pteval;
    1427             :         struct page *subpage;
    1428           0 :         bool ret = true;
    1429             :         struct mmu_notifier_range range;
    1430           0 :         enum ttu_flags flags = (enum ttu_flags)(long)arg;
    1431             : 
    1432             :         /*
    1433             :          * When racing against e.g. zap_pte_range() on another cpu,
    1434             :          * in between its ptep_get_and_clear_full() and page_remove_rmap(),
    1435             :          * try_to_unmap() may return before page_mapped() has become false,
    1436             :          * if page table locking is skipped: use TTU_SYNC to wait for that.
    1437             :          */
    1438           0 :         if (flags & TTU_SYNC)
    1439           0 :                 pvmw.flags = PVMW_SYNC;
    1440             : 
    1441             :         if (flags & TTU_SPLIT_HUGE_PMD)
    1442             :                 split_huge_pmd_address(vma, address, false, folio);
    1443             : 
    1444             :         /*
     1445             :          * For THP, we have to assume the worst case, i.e. pmd, for invalidation.
     1446             :          * For hugetlb, it could be much worse if we need to do pud
     1447             :          * invalidation in the case of pmd sharing.
     1448             :          *
     1449             :          * Note that the folio cannot be freed in this function, as the call
     1450             :          * to try_to_unmap() must hold a reference on the folio.
    1451             :          */
    1452             :         range.end = vma_address_end(&pvmw);
    1453             :         mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
    1454             :                                 address, range.end);
    1455             :         if (folio_test_hugetlb(folio)) {
    1456             :                 /*
    1457             :                  * If sharing is possible, start and end will be adjusted
    1458             :                  * accordingly.
    1459             :                  */
    1460             :                 adjust_range_if_pmd_sharing_possible(vma, &range.start,
    1461             :                                                      &range.end);
    1462             :         }
    1463             :         mmu_notifier_invalidate_range_start(&range);
    1464             : 
    1465           0 :         while (page_vma_mapped_walk(&pvmw)) {
    1466             :                 /* Unexpected PMD-mapped THP? */
    1467             :                 VM_BUG_ON_FOLIO(!pvmw.pte, folio);
    1468             : 
    1469             :                 /*
    1470             :                  * If the folio is in an mlock()d vma, we must not swap it out.
    1471             :                  */
    1472           0 :                 if (!(flags & TTU_IGNORE_MLOCK) &&
    1473           0 :                     (vma->vm_flags & VM_LOCKED)) {
    1474             :                         /* Restore the mlock which got missed */
    1475           0 :                         mlock_vma_folio(folio, vma, false);
    1476           0 :                         page_vma_mapped_walk_done(&pvmw);
    1477             :                         ret = false;
    1478             :                         break;
    1479             :                 }
    1480             : 
    1481           0 :                 subpage = folio_page(folio,
    1482             :                                         pte_pfn(*pvmw.pte) - folio_pfn(folio));
    1483           0 :                 address = pvmw.address;
    1484             : 
    1485           0 :                 if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
    1486             :                         /*
    1487             :                          * To call huge_pmd_unshare, i_mmap_rwsem must be
    1488             :                          * held in write mode.  Caller needs to explicitly
    1489             :                          * do this outside rmap routines.
    1490             :                          */
    1491             :                         VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
    1492             :                         if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
    1493             :                                 /*
    1494             :                                  * huge_pmd_unshare unmapped an entire PMD
    1495             :                                  * page.  There is no way of knowing exactly
    1496             :                                  * which PMDs may be cached for this mm, so
    1497             :                                  * we must flush them all.  start/end were
    1498             :                                  * already adjusted above to cover this range.
    1499             :                                  */
    1500             :                                 flush_cache_range(vma, range.start, range.end);
    1501             :                                 flush_tlb_range(vma, range.start, range.end);
    1502             :                                 mmu_notifier_invalidate_range(mm, range.start,
    1503             :                                                               range.end);
    1504             : 
    1505             :                                 /*
    1506             :                                  * The ref count of the PMD page was dropped
    1507             :                                  * which is part of the way map counting
    1508             :                                  * is done for shared PMDs.  Return 'true'
    1509             :                                  * here.  When there is no other sharing,
    1510             :                                  * huge_pmd_unshare returns false and we will
    1511             :                                  * unmap the actual page and drop map count
    1512             :                                  * to zero.
    1513             :                                  */
    1514             :                                 page_vma_mapped_walk_done(&pvmw);
    1515             :                                 break;
    1516             :                         }
    1517             :                 }
    1518             : 
    1519             :                 /* Nuke the page table entry. */
    1520           0 :                 flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
    1521           0 :                 if (should_defer_flush(mm, flags)) {
    1522             :                         /*
    1523             :                          * We clear the PTE but do not flush so potentially
    1524             :                          * a remote CPU could still be writing to the folio.
    1525             :                          * If the entry was previously clean then the
    1526             :                          * architecture must guarantee that a clear->dirty
    1527             :                          * transition on a cached TLB entry is written through
    1528             :                          * and traps if the PTE is unmapped.
    1529             :                          */
    1530             :                         pteval = ptep_get_and_clear(mm, address, pvmw.pte);
    1531             : 
    1532             :                         set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
    1533             :                 } else {
    1534           0 :                         pteval = ptep_clear_flush(vma, address, pvmw.pte);
    1535             :                 }
    1536             : 
    1537             :                 /* Set the dirty flag on the folio now the pte is gone. */
    1538           0 :                 if (pte_dirty(pteval))
    1539           0 :                         folio_mark_dirty(folio);
    1540             : 
    1541             :                 /* Update high watermark before we lower rss */
    1542           0 :                 update_hiwater_rss(mm);
    1543             : 
    1544           0 :                 if (PageHWPoison(subpage) && !(flags & TTU_IGNORE_HWPOISON)) {
    1545             :                         pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
    1546             :                         if (folio_test_hugetlb(folio)) {
    1547             :                                 hugetlb_count_sub(folio_nr_pages(folio), mm);
    1548             :                                 set_huge_swap_pte_at(mm, address,
    1549             :                                                      pvmw.pte, pteval,
    1550             :                                                      vma_mmu_pagesize(vma));
    1551             :                         } else {
    1552             :                                 dec_mm_counter(mm, mm_counter(&folio->page));
    1553             :                                 set_pte_at(mm, address, pvmw.pte, pteval);
    1554             :                         }
    1555             : 
    1556           0 :                 } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
    1557             :                         /*
    1558             :                          * The guest indicated that the page content is of no
    1559             :                          * interest anymore. Simply discard the pte, vmscan
    1560             :                          * will take care of the rest.
    1561             :                          * A future reference will then fault in a new zero
    1562             :                          * page. When userfaultfd is active, we must not drop
    1563             :                          * this page though, as its main user (postcopy
    1564             :                          * migration) will not expect userfaults on already
    1565             :                          * copied pages.
    1566             :                          */
    1567             :                         dec_mm_counter(mm, mm_counter(&folio->page));
    1568             :                         /* We have to invalidate as we cleared the pte */
    1569             :                         mmu_notifier_invalidate_range(mm, address,
    1570             :                                                       address + PAGE_SIZE);
    1571           0 :                 } else if (folio_test_anon(folio)) {
    1572           0 :                         swp_entry_t entry = { .val = page_private(subpage) };
    1573             :                         pte_t swp_pte;
    1574             :                         /*
    1575             :                          * Store the swap location in the pte.
    1576             :                          * See handle_pte_fault() ...
    1577             :                          */
    1578           0 :                         if (unlikely(folio_test_swapbacked(folio) !=
    1579             :                                         folio_test_swapcache(folio))) {
    1580           0 :                                 WARN_ON_ONCE(1);
    1581           0 :                                 ret = false;
    1582             :                                 /* We have to invalidate as we cleared the pte */
    1583           0 :                                 mmu_notifier_invalidate_range(mm, address,
    1584             :                                                         address + PAGE_SIZE);
    1585           0 :                                 page_vma_mapped_walk_done(&pvmw);
    1586             :                                 break;
    1587             :                         }
    1588             : 
    1589             :                         /* MADV_FREE page check */
    1590           0 :                         if (!folio_test_swapbacked(folio)) {
    1591             :                                 int ref_count, map_count;
    1592             : 
    1593             :                                 /*
    1594             :                                  * Synchronize with gup_pte_range():
    1595             :                                  * - clear PTE; barrier; read refcount
    1596             :                                  * - inc refcount; barrier; read PTE
    1597             :                                  */
    1598           0 :                                 smp_mb();
    1599             : 
    1600           0 :                                 ref_count = folio_ref_count(folio);
    1601           0 :                                 map_count = folio_mapcount(folio);
    1602             : 
    1603             :                                 /*
    1604             :                                  * Order reads for page refcount and dirty flag
    1605             :                                  * (see comments in __remove_mapping()).
    1606             :                                  */
    1607           0 :                                 smp_rmb();
    1608             : 
    1609             :                                 /*
    1610             :                                  * The only page refs must be one from isolation
    1611             :                                  * plus the rmap(s) (dropped by discard:).
    1612             :                                  */
    1613           0 :                                 if (ref_count == 1 + map_count &&
    1614           0 :                                     !folio_test_dirty(folio)) {
    1615             :                                         /* Invalidate as we cleared the pte */
    1616           0 :                                         mmu_notifier_invalidate_range(mm,
    1617             :                                                 address, address + PAGE_SIZE);
    1618           0 :                                         dec_mm_counter(mm, MM_ANONPAGES);
    1619           0 :                                         goto discard;
    1620             :                                 }
    1621             : 
    1622             :                                 /*
    1623             :                                  * If the folio was redirtied, it cannot be
     1624             :                                  * discarded. Remap the page to the page table.
    1625             :                                  */
    1626           0 :                                 set_pte_at(mm, address, pvmw.pte, pteval);
    1627           0 :                                 folio_set_swapbacked(folio);
    1628           0 :                                 ret = false;
    1629           0 :                                 page_vma_mapped_walk_done(&pvmw);
    1630             :                                 break;
    1631             :                         }
    1632             : 
    1633           0 :                         if (swap_duplicate(entry) < 0) {
    1634           0 :                                 set_pte_at(mm, address, pvmw.pte, pteval);
    1635           0 :                                 ret = false;
    1636           0 :                                 page_vma_mapped_walk_done(&pvmw);
    1637             :                                 break;
    1638             :                         }
    1639           0 :                         if (arch_unmap_one(mm, vma, address, pteval) < 0) {
    1640             :                                 set_pte_at(mm, address, pvmw.pte, pteval);
    1641             :                                 ret = false;
    1642             :                                 page_vma_mapped_walk_done(&pvmw);
    1643             :                                 break;
    1644             :                         }
    1645           0 :                         if (list_empty(&mm->mmlist)) {
    1646           0 :                                 spin_lock(&mmlist_lock);
    1647           0 :                                 if (list_empty(&mm->mmlist))
    1648           0 :                                         list_add(&mm->mmlist, &init_mm.mmlist);
    1649             :                                 spin_unlock(&mmlist_lock);
    1650             :                         }
    1651           0 :                         dec_mm_counter(mm, MM_ANONPAGES);
    1652           0 :                         inc_mm_counter(mm, MM_SWAPENTS);
    1653           0 :                         swp_pte = swp_entry_to_pte(entry);
    1654           0 :                         if (pte_soft_dirty(pteval))
    1655             :                                 swp_pte = pte_swp_mksoft_dirty(swp_pte);
    1656             :                         if (pte_uffd_wp(pteval))
    1657             :                                 swp_pte = pte_swp_mkuffd_wp(swp_pte);
    1658           0 :                         set_pte_at(mm, address, pvmw.pte, swp_pte);
    1659             :                         /* Invalidate as we cleared the pte */
    1660           0 :                         mmu_notifier_invalidate_range(mm, address,
    1661             :                                                       address + PAGE_SIZE);
    1662             :                 } else {
    1663             :                         /*
    1664             :                          * This is a locked file-backed folio,
    1665             :                          * so it cannot be removed from the page
    1666             :                          * cache and replaced by a new folio before
    1667             :                          * mmu_notifier_invalidate_range_end, so no
    1668             :                          * concurrent thread might update its page table
    1669             :                          * to point at a new folio while a device is
    1670             :                          * still using this folio.
    1671             :                          *
    1672             :                          * See Documentation/vm/mmu_notifier.rst
    1673             :                          */
    1674           0 :                         dec_mm_counter(mm, mm_counter_file(&folio->page));
    1675             :                 }
    1676             : discard:
    1677             :                 /*
     1678             :                  * No need to call mmu_notifier_invalidate_range(): it has been
     1679             :                  * done above for all cases requiring it to happen under the
     1680             :                  * page table lock, before mmu_notifier_invalidate_range_end().
    1681             :                  *
    1682             :                  * See Documentation/vm/mmu_notifier.rst
    1683             :                  */
    1684           0 :                 page_remove_rmap(subpage, vma, folio_test_hugetlb(folio));
    1685           0 :                 if (vma->vm_flags & VM_LOCKED)
    1686           0 :                         mlock_page_drain_local();
    1687             :                 folio_put(folio);
    1688             :         }
    1689             : 
    1690           0 :         mmu_notifier_invalidate_range_end(&range);
    1691             : 
    1692           0 :         return ret;
    1693             : }
    1694             : 
    1695           0 : static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
    1696             : {
    1697           0 :         return vma_is_temporary_stack(vma);
    1698             : }
    1699             : 
    1700           0 : static int page_not_mapped(struct folio *folio)
    1701             : {
    1702           0 :         return !folio_mapped(folio);
    1703             : }
    1704             : 
    1705             : /**
    1706             :  * try_to_unmap - Try to remove all page table mappings to a folio.
    1707             :  * @folio: The folio to unmap.
    1708             :  * @flags: action and flags
    1709             :  *
    1710             :  * Tries to remove all the page table entries which are mapping this
    1711             :  * folio.  It is the caller's responsibility to check if the folio is
    1712             :  * still mapped if needed (use TTU_SYNC to prevent accounting races).
    1713             :  *
    1714             :  * Context: Caller must hold the folio lock.
    1715             :  */
    1716           0 : void try_to_unmap(struct folio *folio, enum ttu_flags flags)
    1717             : {
    1718           0 :         struct rmap_walk_control rwc = {
    1719             :                 .rmap_one = try_to_unmap_one,
    1720           0 :                 .arg = (void *)flags,
    1721             :                 .done = page_not_mapped,
    1722             :                 .anon_lock = folio_lock_anon_vma_read,
    1723             :         };
    1724             : 
    1725           0 :         if (flags & TTU_RMAP_LOCKED)
    1726           0 :                 rmap_walk_locked(folio, &rwc);
    1727             :         else
    1728           0 :                 rmap_walk(folio, &rwc);
    1729           0 : }
    1730             : 
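/*
 * Editor's illustrative sketch, not part of rmap.c: how a reclaim-like
 * caller might use try_to_unmap().  The folio must be locked, and
 * because try_to_unmap() returns void, "success" is checked by looking
 * at folio_mapped() afterwards; TTU_SYNC closes the race described in
 * try_to_unmap_one() above.
 */
static bool example_unmap_for_reclaim(struct folio *folio)
{
        if (folio_mapped(folio))
                try_to_unmap(folio, TTU_SYNC);

        return !folio_mapped(folio);
}
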
    1731             : /*
    1732             :  * @arg: enum ttu_flags will be passed to this argument.
    1733             :  *
    1734             :  * If TTU_SPLIT_HUGE_PMD is specified any PMD mappings will be split into PTEs
    1735             :  * containing migration entries.
    1736             :  */
    1737           0 : static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
    1738             :                      unsigned long address, void *arg)
    1739             : {
    1740           0 :         struct mm_struct *mm = vma->vm_mm;
    1741           0 :         DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
    1742             :         pte_t pteval;
    1743             :         struct page *subpage;
    1744           0 :         bool ret = true;
    1745             :         struct mmu_notifier_range range;
    1746           0 :         enum ttu_flags flags = (enum ttu_flags)(long)arg;
    1747             : 
    1748             :         /*
    1749             :          * When racing against e.g. zap_pte_range() on another cpu,
    1750             :          * in between its ptep_get_and_clear_full() and page_remove_rmap(),
    1751             :          * try_to_migrate() may return before page_mapped() has become false,
    1752             :          * if page table locking is skipped: use TTU_SYNC to wait for that.
    1753             :          */
    1754           0 :         if (flags & TTU_SYNC)
    1755           0 :                 pvmw.flags = PVMW_SYNC;
    1756             : 
    1757             :         /*
    1758             :          * unmap_page() in mm/huge_memory.c is the only user of migration with
    1759             :          * TTU_SPLIT_HUGE_PMD and it wants to freeze.
    1760             :          */
    1761             :         if (flags & TTU_SPLIT_HUGE_PMD)
    1762             :                 split_huge_pmd_address(vma, address, true, folio);
    1763             : 
    1764             :         /*
     1765             :          * For THP, we have to assume the worst case, i.e. pmd, for invalidation.
     1766             :          * For hugetlb, it could be much worse if we need to do pud
     1767             :          * invalidation in the case of pmd sharing.
     1768             :          *
     1769             :          * Note that the page cannot be freed in this function, as the call
     1770             :          * to try_to_migrate() must hold a reference on the page.
    1771             :          */
    1772             :         range.end = vma_address_end(&pvmw);
    1773             :         mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
    1774             :                                 address, range.end);
    1775             :         if (folio_test_hugetlb(folio)) {
    1776             :                 /*
    1777             :                  * If sharing is possible, start and end will be adjusted
    1778             :                  * accordingly.
    1779             :                  */
    1780             :                 adjust_range_if_pmd_sharing_possible(vma, &range.start,
    1781             :                                                      &range.end);
    1782             :         }
    1783             :         mmu_notifier_invalidate_range_start(&range);
    1784             : 
    1785           0 :         while (page_vma_mapped_walk(&pvmw)) {
    1786             : #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
    1787             :                 /* PMD-mapped THP migration entry */
    1788             :                 if (!pvmw.pte) {
    1789             :                         subpage = folio_page(folio,
    1790             :                                 pmd_pfn(*pvmw.pmd) - folio_pfn(folio));
    1791             :                         VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
    1792             :                                         !folio_test_pmd_mappable(folio), folio);
    1793             : 
    1794             :                         set_pmd_migration_entry(&pvmw, subpage);
    1795             :                         continue;
    1796             :                 }
    1797             : #endif
    1798             : 
    1799             :                 /* Unexpected PMD-mapped THP? */
    1800             :                 VM_BUG_ON_FOLIO(!pvmw.pte, folio);
    1801             : 
    1802           0 :                 subpage = folio_page(folio,
    1803             :                                 pte_pfn(*pvmw.pte) - folio_pfn(folio));
    1804           0 :                 address = pvmw.address;
    1805             : 
    1806           0 :                 if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
    1807             :                         /*
    1808             :                          * To call huge_pmd_unshare, i_mmap_rwsem must be
    1809             :                          * held in write mode.  Caller needs to explicitly
    1810             :                          * do this outside rmap routines.
    1811             :                          */
    1812             :                         VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
    1813             :                         if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
    1814             :                                 /*
    1815             :                                  * huge_pmd_unshare unmapped an entire PMD
    1816             :                                  * page.  There is no way of knowing exactly
    1817             :                                  * which PMDs may be cached for this mm, so
    1818             :                                  * we must flush them all.  start/end were
    1819             :                                  * already adjusted above to cover this range.
    1820             :                                  */
    1821             :                                 flush_cache_range(vma, range.start, range.end);
    1822             :                                 flush_tlb_range(vma, range.start, range.end);
    1823             :                                 mmu_notifier_invalidate_range(mm, range.start,
    1824             :                                                               range.end);
    1825             : 
    1826             :                                 /*
    1827             :                                  * The ref count of the PMD page was dropped
    1828             :                                  * which is part of the way map counting
    1829             :                                  * is done for shared PMDs.  Return 'true'
    1830             :                                  * here.  When there is no other sharing,
    1831             :                                  * huge_pmd_unshare returns false and we will
    1832             :                                  * unmap the actual page and drop map count
    1833             :                                  * to zero.
    1834             :                                  */
    1835             :                                 page_vma_mapped_walk_done(&pvmw);
    1836             :                                 break;
    1837             :                         }
    1838             :                 }
    1839             : 
    1840             :                 /* Nuke the page table entry. */
    1841           0 :                 flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
    1842           0 :                 pteval = ptep_clear_flush(vma, address, pvmw.pte);
    1843             : 
    1844             :                 /* Set the dirty flag on the folio now the pte is gone. */
    1845           0 :                 if (pte_dirty(pteval))
    1846           0 :                         folio_mark_dirty(folio);
    1847             : 
    1848             :                 /* Update high watermark before we lower rss */
    1849           0 :                 update_hiwater_rss(mm);
    1850             : 
    1851           0 :                 if (folio_is_zone_device(folio)) {
    1852             :                         unsigned long pfn = folio_pfn(folio);
    1853             :                         swp_entry_t entry;
    1854             :                         pte_t swp_pte;
    1855             : 
    1856             :                         /*
    1857             :                          * Store the pfn of the page in a special migration
    1858             :                          * pte. do_swap_page() will wait until the migration
    1859             :                          * pte is removed and then restart fault handling.
    1860             :                          */
    1861             :                         entry = pte_to_swp_entry(pteval);
    1862             :                         if (is_writable_device_private_entry(entry))
    1863             :                                 entry = make_writable_migration_entry(pfn);
    1864             :                         else
    1865             :                                 entry = make_readable_migration_entry(pfn);
    1866             :                         swp_pte = swp_entry_to_pte(entry);
    1867             : 
    1868             :                         /*
    1869             :                          * pteval maps a zone device page and is therefore
    1870             :                          * a swap pte.
    1871             :                          */
    1872             :                         if (pte_swp_soft_dirty(pteval))
    1873             :                                 swp_pte = pte_swp_mksoft_dirty(swp_pte);
    1874             :                         if (pte_swp_uffd_wp(pteval))
    1875             :                                 swp_pte = pte_swp_mkuffd_wp(swp_pte);
    1876             :                         set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
    1877             :                         trace_set_migration_pte(pvmw.address, pte_val(swp_pte),
    1878             :                                                 compound_order(&folio->page));
    1879             :                         /*
    1880             :                          * No need to invalidate here; it will be synchronized
    1881             :                          * against the special swap migration pte.
    1882             :                          *
    1883             :                          * The assignment to subpage above was computed from a
    1884             :                          * swap PTE which results in an invalid pointer.
    1885             :                          * Since only PAGE_SIZE pages can currently be
    1886             :                          * migrated, just set it to page. This will need to be
    1887             :                          * changed when hugepage migrations to device private
    1888             :                          * memory are supported.
    1889             :                          */
    1890             :                         subpage = &folio->page;
    1891           0 :                 } else if (PageHWPoison(subpage)) {
    1892             :                         pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
    1893             :                         if (folio_test_hugetlb(folio)) {
    1894             :                                 hugetlb_count_sub(folio_nr_pages(folio), mm);
    1895             :                                 set_huge_swap_pte_at(mm, address,
    1896             :                                                      pvmw.pte, pteval,
    1897             :                                                      vma_mmu_pagesize(vma));
    1898             :                         } else {
    1899             :                                 dec_mm_counter(mm, mm_counter(&folio->page));
    1900             :                                 set_pte_at(mm, address, pvmw.pte, pteval);
    1901             :                         }
    1902             : 
    1903           0 :                 } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
    1904             :                         /*
    1905             :                          * The guest indicated that the page content is of no
    1906             :                          * interest anymore. Simply discard the pte, vmscan
    1907             :                          * will take care of the rest.
    1908             :                          * A future reference will then fault in a new zero
    1909             :                          * page. When userfaultfd is active, we must not drop
    1910             :                          * this page though, as its main user (postcopy
    1911             :                          * migration) will not expect userfaults on already
    1912             :                          * copied pages.
    1913             :                          */
    1914             :                         dec_mm_counter(mm, mm_counter(&folio->page));
    1915             :                         /* We have to invalidate as we cleared the pte */
    1916             :                         mmu_notifier_invalidate_range(mm, address,
    1917             :                                                       address + PAGE_SIZE);
    1918             :                 } else {
    1919             :                         swp_entry_t entry;
    1920             :                         pte_t swp_pte;
    1921             : 
    1922           0 :                         if (arch_unmap_one(mm, vma, address, pteval) < 0) {
    1923             :                                 set_pte_at(mm, address, pvmw.pte, pteval);
    1924             :                                 ret = false;
    1925             :                                 page_vma_mapped_walk_done(&pvmw);
    1926             :                                 break;
    1927             :                         }
    1928             : 
    1929             :                         /*
    1930             :                          * Store the pfn of the page in a special migration
    1931             :                          * pte. do_swap_page() will wait until the migration
    1932             :                          * pte is removed and then restart fault handling.
    1933             :                          */
    1934           0 :                         if (pte_write(pteval))
    1935           0 :                                 entry = make_writable_migration_entry(
    1936           0 :                                                         page_to_pfn(subpage));
    1937             :                         else
    1938           0 :                                 entry = make_readable_migration_entry(
    1939           0 :                                                         page_to_pfn(subpage));
    1940             : 
    1941           0 :                         swp_pte = swp_entry_to_pte(entry);
    1942           0 :                         if (pte_soft_dirty(pteval))
    1943             :                                 swp_pte = pte_swp_mksoft_dirty(swp_pte);
    1944             :                         if (pte_uffd_wp(pteval))
    1945             :                                 swp_pte = pte_swp_mkuffd_wp(swp_pte);
    1946           0 :                         set_pte_at(mm, address, pvmw.pte, swp_pte);
    1947           0 :                         trace_set_migration_pte(address, pte_val(swp_pte),
    1948           0 :                                                 compound_order(&folio->page));
    1949             :                         /*
    1950             :                          * No need to invalidate here; it will be synchronized
    1951             :                          * against the special swap migration pte.
    1952             :                          */
    1953             :                 }
    1954             : 
    1955             :                 /*
    1956             :                  * No need to call mmu_notifier_invalidate_range(); it has been
    1957             :                  * done above for all cases requiring it to happen under the page
    1958             :                  * table lock, before mmu_notifier_invalidate_range_end().
    1959             :                  *
    1960             :                  * See Documentation/vm/mmu_notifier.rst
    1961             :                  */
    1962           0 :                 page_remove_rmap(subpage, vma, folio_test_hugetlb(folio));
    1963           0 :                 if (vma->vm_flags & VM_LOCKED)
    1964           0 :                         mlock_page_drain_local();
    1965             :                 folio_put(folio);
    1966             :         }
    1967             : 
    1968           0 :         mmu_notifier_invalidate_range_end(&range);
    1969             : 
    1970           0 :         return ret;
    1971             : }
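/*
 * Editorial sketch, not part of rmap.c: the bare page_vma_mapped_walk()
 * pattern used by the function ending above (try_to_migrate_one) and by
 * page_make_device_exclusive_one() below.  On an early exit the walk must
 * be finished with page_vma_mapped_walk_done().  The function name here
 * is illustrative only.
 */
static bool example_walk_skeleton(struct folio *folio,
                                  struct vm_area_struct *vma,
                                  unsigned long address, void *arg)
{
        DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
        bool ret = true;

        while (page_vma_mapped_walk(&pvmw)) {
                if (!pvmw.pte) {
                        /* Bail out, e.g. on an unexpected PMD mapping. */
                        ret = false;
                        page_vma_mapped_walk_done(&pvmw);
                        break;
                }
                /* ... operate on the pte at pvmw.address here ... */
        }

        return ret;
}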
    1972             : 
    1973             : /**
    1974             :  * try_to_migrate - try to replace all page table mappings with swap entries
    1975             :  * @folio: the folio to replace page table entries for
    1976             :  * @flags: action and flags
    1977             :  *
    1978             :  * Tries to remove all the page table entries which are mapping this folio and
    1979             :  * replace them with special swap entries. Caller must hold the folio lock.
    1980             :  */
    1981           0 : void try_to_migrate(struct folio *folio, enum ttu_flags flags)
    1982             : {
    1983           0 :         struct rmap_walk_control rwc = {
    1984             :                 .rmap_one = try_to_migrate_one,
    1985           0 :                 .arg = (void *)flags,
    1986             :                 .done = page_not_mapped,
    1987             :                 .anon_lock = folio_lock_anon_vma_read,
    1988             :         };
    1989             : 
    1990             :         /*
    1991             :          * Migration always ignores mlock and only supports the TTU_RMAP_LOCKED,
    1992             :          * TTU_SPLIT_HUGE_PMD and TTU_SYNC flags.
    1993             :          */
    1994           0 :         if (WARN_ON_ONCE(flags & ~(TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
    1995             :                                         TTU_SYNC)))
    1996           0 :                 return;
    1997             : 
    1998           0 :         if (folio_is_zone_device(folio) && !folio_is_device_private(folio))
    1999             :                 return;
    2000             : 
    2001             :         /*
    2002             :          * During exec, a temporary VMA is set up and later moved.
    2003             :          * The VMA is moved under the anon_vma lock but the page
    2004             :          * tables are not, leading to a race where migration cannot
    2005             :          * find the migration ptes. Rather than increasing the
    2006             :          * locking requirements of exec(), migration skips
    2007             :          * temporary VMAs until after exec() completes.
    2008             :          */
    2009           0 :         if (!folio_test_ksm(folio) && folio_test_anon(folio))
    2010           0 :                 rwc.invalid_vma = invalid_migration_vma;
    2011             : 
    2012           0 :         if (flags & TTU_RMAP_LOCKED)
    2013           0 :                 rmap_walk_locked(folio, &rwc);
    2014             :         else
    2015           0 :                 rmap_walk(folio, &rwc);
    2016             : }
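/*
 * Editorial sketch, not part of rmap.c: a minimal, hypothetical caller of
 * try_to_migrate(), assuming the convention visible above that the folio
 * must be locked and that an empty map count means every pte was replaced
 * by a migration entry.  The function name is illustrative only.
 */
static bool example_replace_with_migration_entries(struct folio *folio)
{
        bool unmapped;

        if (!folio_trylock(folio))
                return false;

        /* Replace each pte mapping the folio with a migration swap entry. */
        try_to_migrate(folio, 0);

        /* Only report success if no mapping of the folio is left. */
        unmapped = !folio_mapped(folio);

        folio_unlock(folio);
        return unmapped;
}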
    2017             : 
    2018             : #ifdef CONFIG_DEVICE_PRIVATE
    2019             : struct make_exclusive_args {
    2020             :         struct mm_struct *mm;
    2021             :         unsigned long address;
    2022             :         void *owner;
    2023             :         bool valid;
    2024             : };
    2025             : 
    2026             : static bool page_make_device_exclusive_one(struct folio *folio,
    2027             :                 struct vm_area_struct *vma, unsigned long address, void *priv)
    2028             : {
    2029             :         struct mm_struct *mm = vma->vm_mm;
    2030             :         DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
    2031             :         struct make_exclusive_args *args = priv;
    2032             :         pte_t pteval;
    2033             :         struct page *subpage;
    2034             :         bool ret = true;
    2035             :         struct mmu_notifier_range range;
    2036             :         swp_entry_t entry;
    2037             :         pte_t swp_pte;
    2038             : 
    2039             :         mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma,
    2040             :                                       vma->vm_mm, address, min(vma->vm_end,
    2041             :                                       address + folio_size(folio)),
    2042             :                                       args->owner);
    2043             :         mmu_notifier_invalidate_range_start(&range);
    2044             : 
    2045             :         while (page_vma_mapped_walk(&pvmw)) {
    2046             :                 /* Unexpected PMD-mapped THP? */
    2047             :                 VM_BUG_ON_FOLIO(!pvmw.pte, folio);
    2048             : 
    2049             :                 if (!pte_present(*pvmw.pte)) {
    2050             :                         ret = false;
    2051             :                         page_vma_mapped_walk_done(&pvmw);
    2052             :                         break;
    2053             :                 }
    2054             : 
    2055             :                 subpage = folio_page(folio,
    2056             :                                 pte_pfn(*pvmw.pte) - folio_pfn(folio));
    2057             :                 address = pvmw.address;
    2058             : 
    2059             :                 /* Nuke the page table entry. */
    2060             :                 flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
    2061             :                 pteval = ptep_clear_flush(vma, address, pvmw.pte);
    2062             : 
    2063             :                 /* Set the dirty flag on the folio now the pte is gone. */
    2064             :                 if (pte_dirty(pteval))
    2065             :                         folio_mark_dirty(folio);
    2066             : 
    2067             :                 /*
    2068             :                  * Check that our target page is still mapped at the expected
    2069             :                  * address.
    2070             :                  */
    2071             :                 if (args->mm == mm && args->address == address &&
    2072             :                     pte_write(pteval))
    2073             :                         args->valid = true;
    2074             : 
    2075             :                 /*
    2076             :                  * Store the pfn of the page in a special device-exclusive
    2077             :                  * swap pte. On the next CPU access, do_swap_page() will
    2078             :                  * restore the original mapping after notifying the device.
    2079             :                  */
    2080             :                 if (pte_write(pteval))
    2081             :                         entry = make_writable_device_exclusive_entry(
    2082             :                                                         page_to_pfn(subpage));
    2083             :                 else
    2084             :                         entry = make_readable_device_exclusive_entry(
    2085             :                                                         page_to_pfn(subpage));
    2086             :                 swp_pte = swp_entry_to_pte(entry);
    2087             :                 if (pte_soft_dirty(pteval))
    2088             :                         swp_pte = pte_swp_mksoft_dirty(swp_pte);
    2089             :                 if (pte_uffd_wp(pteval))
    2090             :                         swp_pte = pte_swp_mkuffd_wp(swp_pte);
    2091             : 
    2092             :                 set_pte_at(mm, address, pvmw.pte, swp_pte);
    2093             : 
    2094             :                 /*
    2095             :                  * There is already a reference on the page for the swap entry
    2096             :                  * that replaced the removed pte, so we shouldn't take another.
    2097             :                  */
    2098             :                 page_remove_rmap(subpage, vma, false);
    2099             :         }
    2100             : 
    2101             :         mmu_notifier_invalidate_range_end(&range);
    2102             : 
    2103             :         return ret;
    2104             : }
    2105             : 
    2106             : /**
    2107             :  * folio_make_device_exclusive - Mark the folio exclusively owned by a device.
    2108             :  * @folio: The folio to replace page table entries for.
    2109             :  * @mm: The mm_struct where the folio is expected to be mapped.
    2110             :  * @address: Address where the folio is expected to be mapped.
    2111             :  * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier callbacks
    2112             :  *
    2113             :  * Tries to remove all the page table entries which are mapping this
    2114             :  * folio and replace them with special device exclusive swap entries to
    2115             :  * grant a device exclusive access to the folio.
    2116             :  *
    2117             :  * Context: Caller must hold the folio lock.
    2118             :  * Return: false if the page is still mapped, or if it could not be unmapped
    2119             :  * from the expected address. Otherwise returns true (success).
    2120             :  */
    2121             : static bool folio_make_device_exclusive(struct folio *folio,
    2122             :                 struct mm_struct *mm, unsigned long address, void *owner)
    2123             : {
    2124             :         struct make_exclusive_args args = {
    2125             :                 .mm = mm,
    2126             :                 .address = address,
    2127             :                 .owner = owner,
    2128             :                 .valid = false,
    2129             :         };
    2130             :         struct rmap_walk_control rwc = {
    2131             :                 .rmap_one = page_make_device_exclusive_one,
    2132             :                 .done = page_not_mapped,
    2133             :                 .anon_lock = folio_lock_anon_vma_read,
    2134             :                 .arg = &args,
    2135             :         };
    2136             : 
    2137             :         /*
    2138             :          * Restrict to anonymous folios for now to avoid potential writeback
    2139             :          * issues.
    2140             :          */
    2141             :         if (!folio_test_anon(folio))
    2142             :                 return false;
    2143             : 
    2144             :         rmap_walk(folio, &rwc);
    2145             : 
    2146             :         return args.valid && !folio_mapcount(folio);
    2147             : }
    2148             : 
    2149             : /**
    2150             :  * make_device_exclusive_range() - Mark a range for exclusive use by a device
    2151             :  * @mm: mm_struct of the associated target process
    2152             :  * @start: start of the region to mark for exclusive device access
    2153             :  * @end: end address of region
    2154             :  * @pages: returns the pages which were successfully marked for exclusive access
    2155             :  * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering
    2156             :  *
    2157             :  * Returns: number of pages found in the range by GUP. A page is marked for
    2158             :  * exclusive access only if the page pointer is non-NULL.
    2159             :  *
    2160             :  * This function finds ptes mapping page(s) within the given address range, locks
    2161             :  * them and replaces mappings with special swap entries preventing userspace CPU
    2162             :  * access. On fault these entries are replaced with the original mapping after
    2163             :  * calling MMU notifiers.
    2164             :  *
    2165             :  * A driver using this to program access from a device must use an mmu notifier
    2166             :  * critical section to hold a device-specific lock during programming. Once
    2167             :  * programming is complete it should drop the page lock and reference, after
    2168             :  * which point CPU access to the page will revoke the exclusive access.
    2169             :  */
    2170             : int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
    2171             :                                 unsigned long end, struct page **pages,
    2172             :                                 void *owner)
    2173             : {
    2174             :         long npages = (end - start) >> PAGE_SHIFT;
    2175             :         long i;
    2176             : 
    2177             :         npages = get_user_pages_remote(mm, start, npages,
    2178             :                                        FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
    2179             :                                        pages, NULL, NULL);
    2180             :         if (npages < 0)
    2181             :                 return npages;
    2182             : 
    2183             :         for (i = 0; i < npages; i++, start += PAGE_SIZE) {
    2184             :                 struct folio *folio = page_folio(pages[i]);
    2185             :                 if (PageTail(pages[i]) || !folio_trylock(folio)) {
    2186             :                         folio_put(folio);
    2187             :                         pages[i] = NULL;
    2188             :                         continue;
    2189             :                 }
    2190             : 
    2191             :                 if (!folio_make_device_exclusive(folio, mm, start, owner)) {
    2192             :                         folio_unlock(folio);
    2193             :                         folio_put(folio);
    2194             :                         pages[i] = NULL;
    2195             :                 }
    2196             :         }
    2197             : 
    2198             :         return npages;
    2199             : }
    2200             : EXPORT_SYMBOL_GPL(make_device_exclusive_range);
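/*
 * Editorial sketch, not part of rmap.c: how a driver might use
 * make_device_exclusive_range() for a single page, assuming (as existing
 * callers do) that mmap_read_lock() is held across the call and that the
 * driver's MMU_NOTIFY_EXCLUSIVE notifier filters on the owner pointer.
 * The function name and error handling are illustrative only.
 */
static int example_grab_exclusive_page(struct mm_struct *mm,
                                       unsigned long addr, void *owner)
{
        struct page *page = NULL;
        int npages;

        mmap_read_lock(mm);
        npages = make_device_exclusive_range(mm, addr, addr + PAGE_SIZE,
                                             &page, owner);
        mmap_read_unlock(mm);

        if (npages != 1 || !page)
                return -EBUSY;

        /*
         * ... program the device mapping here, under the device-specific
         * lock that the driver's notifier callback also takes ...
         */

        /* Dropping the lock and reference lets CPU access revoke exclusivity. */
        unlock_page(page);
        put_page(page);
        return 0;
}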
    2201             : #endif
    2202             : 
    2203           0 : void __put_anon_vma(struct anon_vma *anon_vma)
    2204             : {
    2205           0 :         struct anon_vma *root = anon_vma->root;
    2206             : 
    2207           0 :         anon_vma_free(anon_vma);
    2208           0 :         if (root != anon_vma && atomic_dec_and_test(&root->refcount))
    2209           0 :                 anon_vma_free(root);
    2210           0 : }
    2211             : 
    2212           0 : static struct anon_vma *rmap_walk_anon_lock(struct folio *folio,
    2213             :                                         const struct rmap_walk_control *rwc)
    2214             : {
    2215             :         struct anon_vma *anon_vma;
    2216             : 
    2217           0 :         if (rwc->anon_lock)
    2218           0 :                 return rwc->anon_lock(folio);
    2219             : 
    2220             :         /*
    2221             :          * Note: remove_migration_ptes() cannot use folio_lock_anon_vma_read()
    2222             :          * because that depends on page_mapped(); but not all its usages
    2223             :          * are holding mmap_lock. Users without mmap_lock are required to
    2224             :          * take a reference count to prevent the anon_vma from disappearing.
    2225             :          */
    2226           0 :         anon_vma = folio_anon_vma(folio);
    2227           0 :         if (!anon_vma)
    2228             :                 return NULL;
    2229             : 
    2230           0 :         anon_vma_lock_read(anon_vma);
    2231             :         return anon_vma;
    2232             : }
    2233             : 
    2234             : /*
    2235             :  * rmap_walk_anon - do something to anonymous page using the object-based
    2236             :  * rmap method
    2237             :  * @folio: the folio to be handled
    2238             :  * @rwc: control variable according to each walk type
    2239             :  *
    2240             :  * Find all the mappings of a folio using the mapping pointer and the vma chains
    2241             :  * contained in the anon_vma struct it points to.
    2242             :  */
    2243           0 : static void rmap_walk_anon(struct folio *folio,
    2244             :                 const struct rmap_walk_control *rwc, bool locked)
    2245             : {
    2246             :         struct anon_vma *anon_vma;
    2247             :         pgoff_t pgoff_start, pgoff_end;
    2248             :         struct anon_vma_chain *avc;
    2249             : 
    2250           0 :         if (locked) {
    2251           0 :                 anon_vma = folio_anon_vma(folio);
    2252             :                 /* did the anon_vma disappear under us? */
    2253             :                 VM_BUG_ON_FOLIO(!anon_vma, folio);
    2254             :         } else {
    2255           0 :                 anon_vma = rmap_walk_anon_lock(folio, rwc);
    2256             :         }
    2257           0 :         if (!anon_vma)
    2258             :                 return;
    2259             : 
    2260           0 :         pgoff_start = folio_pgoff(folio);
    2261           0 :         pgoff_end = pgoff_start + folio_nr_pages(folio) - 1;
    2262           0 :         anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
    2263             :                         pgoff_start, pgoff_end) {
    2264           0 :                 struct vm_area_struct *vma = avc->vma;
    2265           0 :                 unsigned long address = vma_address(&folio->page, vma);
    2266             : 
    2267             :                 VM_BUG_ON_VMA(address == -EFAULT, vma);
    2268           0 :                 cond_resched();
    2269             : 
    2270           0 :                 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
    2271           0 :                         continue;
    2272             : 
    2273           0 :                 if (!rwc->rmap_one(folio, vma, address, rwc->arg))
    2274             :                         break;
    2275           0 :                 if (rwc->done && rwc->done(folio))
    2276             :                         break;
    2277             :         }
    2278             : 
    2279           0 :         if (!locked)
    2280           0 :                 anon_vma_unlock_read(anon_vma);
    2281             : }
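/*
 * Editorial sketch, not part of rmap.c: the address arithmetic behind the
 * interval-tree walks above.  For a vma whose pgoff range covers the folio,
 * the mapped virtual address of a given page offset is computed as below;
 * the real helper, vma_address(), additionally returns -EFAULT when the
 * result falls outside the vma.  The function name is illustrative only.
 */
static unsigned long example_pgoff_to_address(struct vm_area_struct *vma,
                                              pgoff_t pgoff)
{
        return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
}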
    2282             : 
    2283             : /*
    2284             :  * rmap_walk_file - do something to file page using the object-based rmap method
    2285             :  * @folio: the folio to be handled
    2286             :  * @rwc: control variable according to each walk type
    2287             :  *
    2288             :  * Find all the mappings of a folio using the mapping pointer and the vma chains
    2289             :  * contained in the address_space struct it points to.
    2290             :  */
    2291           0 : static void rmap_walk_file(struct folio *folio,
    2292             :                 const struct rmap_walk_control *rwc, bool locked)
    2293             : {
    2294           0 :         struct address_space *mapping = folio_mapping(folio);
    2295             :         pgoff_t pgoff_start, pgoff_end;
    2296             :         struct vm_area_struct *vma;
    2297             : 
    2298             :         /*
    2299             :          * The page lock not only makes sure that page->mapping cannot
    2300             :          * suddenly be NULLified by truncation, it makes sure that the
    2301             :          * structure at mapping cannot be freed and reused yet,
    2302             :          * so we can safely take mapping->i_mmap_rwsem.
    2303             :          */
    2304             :         VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
    2305             : 
    2306           0 :         if (!mapping)
    2307             :                 return;
    2308             : 
    2309           0 :         pgoff_start = folio_pgoff(folio);
    2310           0 :         pgoff_end = pgoff_start + folio_nr_pages(folio) - 1;
    2311           0 :         if (!locked)
    2312             :                 i_mmap_lock_read(mapping);
    2313           0 :         vma_interval_tree_foreach(vma, &mapping->i_mmap,
    2314             :                         pgoff_start, pgoff_end) {
    2315           0 :                 unsigned long address = vma_address(&folio->page, vma);
    2316             : 
    2317             :                 VM_BUG_ON_VMA(address == -EFAULT, vma);
    2318           0 :                 cond_resched();
    2319             : 
    2320           0 :                 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
    2321           0 :                         continue;
    2322             : 
    2323           0 :                 if (!rwc->rmap_one(folio, vma, address, rwc->arg))
    2324             :                         goto done;
    2325           0 :                 if (rwc->done && rwc->done(folio))
    2326             :                         goto done;
    2327             :         }
    2328             : 
    2329             : done:
    2330           0 :         if (!locked)
    2331             :                 i_mmap_unlock_read(mapping);
    2332             : }
    2333             : 
    2334           0 : void rmap_walk(struct folio *folio, const struct rmap_walk_control *rwc)
    2335             : {
    2336           0 :         if (unlikely(folio_test_ksm(folio)))
    2337             :                 rmap_walk_ksm(folio, rwc);
    2338           0 :         else if (folio_test_anon(folio))
    2339           0 :                 rmap_walk_anon(folio, rwc, false);
    2340             :         else
    2341           0 :                 rmap_walk_file(folio, rwc, false);
    2342           0 : }
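/*
 * Editorial sketch, not part of rmap.c: a minimal, hypothetical rmap_walk()
 * user modelled on the rmap_walk_control instances defined in this file.
 * Returning true from the per-vma callback keeps the walk going; returning
 * false stops it.  The caller is assumed to hold the folio lock, as the
 * file-backed walk above asserts.
 */
static bool example_count_one(struct folio *folio, struct vm_area_struct *vma,
                              unsigned long address, void *arg)
{
        unsigned long *mappings = arg;

        (*mappings)++;          /* count this vma-level mapping */
        return true;            /* keep walking */
}

static unsigned long example_count_mappings(struct folio *folio)
{
        unsigned long mappings = 0;
        struct rmap_walk_control rwc = {
                .rmap_one = example_count_one,
                .arg = &mappings,
        };

        rmap_walk(folio, &rwc);
        return mappings;
}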
    2343             : 
    2344             : /* Like rmap_walk, but caller holds relevant rmap lock */
    2345           0 : void rmap_walk_locked(struct folio *folio, const struct rmap_walk_control *rwc)
    2346             : {
    2347             :         /* no ksm support for now */
    2348             :         VM_BUG_ON_FOLIO(folio_test_ksm(folio), folio);
    2349           0 :         if (folio_test_anon(folio))
    2350           0 :                 rmap_walk_anon(folio, rwc, true);
    2351             :         else
    2352           0 :                 rmap_walk_file(folio, rwc, true);
    2353           0 : }
    2354             : 
    2355             : #ifdef CONFIG_HUGETLB_PAGE
    2356             : /*
    2357             :  * The following two functions are for anonymous (private mapped) hugepages.
    2358             :  * Unlike common anonymous pages, anonymous hugepages have no accounting code
    2359             :  * and no lru code, because we handle hugepages differently from common pages.
    2360             :  */
    2361             : void hugepage_add_anon_rmap(struct page *page,
    2362             :                             struct vm_area_struct *vma, unsigned long address)
    2363             : {
    2364             :         struct anon_vma *anon_vma = vma->anon_vma;
    2365             :         int first;
    2366             : 
    2367             :         BUG_ON(!PageLocked(page));
    2368             :         BUG_ON(!anon_vma);
    2369             :         /* address might be in next vma when migration races vma_adjust */
    2370             :         first = atomic_inc_and_test(compound_mapcount_ptr(page));
    2371             :         if (first)
    2372             :                 __page_set_anon_rmap(page, vma, address, 0);
    2373             : }
    2374             : 
    2375             : void hugepage_add_new_anon_rmap(struct page *page,
    2376             :                         struct vm_area_struct *vma, unsigned long address)
    2377             : {
    2378             :         BUG_ON(address < vma->vm_start || address >= vma->vm_end);
    2379             :         atomic_set(compound_mapcount_ptr(page), 0);
    2380             :         atomic_set(compound_pincount_ptr(page), 0);
    2381             : 
    2382             :         __page_set_anon_rmap(page, vma, address, 1);
    2383             : }
    2384             : #endif /* CONFIG_HUGETLB_PAGE */

Generated by: LCOV version 1.14