LCOV - code coverage report
Current view: top level - mm - vmalloc.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 291 1055 27.6 %
Date: 2022-12-09 01:23:36 Functions: 20 91 22.0 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-only
       2             : /*
       3             :  *  Copyright (C) 1993  Linus Torvalds
       4             :  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
       5             :  *  SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
       6             :  *  Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
       7             :  *  Numa awareness, Christoph Lameter, SGI, June 2005
       8             :  *  Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019
       9             :  */
      10             : 
      11             : #include <linux/vmalloc.h>
      12             : #include <linux/mm.h>
      13             : #include <linux/module.h>
      14             : #include <linux/highmem.h>
      15             : #include <linux/sched/signal.h>
      16             : #include <linux/slab.h>
      17             : #include <linux/spinlock.h>
      18             : #include <linux/interrupt.h>
      19             : #include <linux/proc_fs.h>
      20             : #include <linux/seq_file.h>
      21             : #include <linux/set_memory.h>
      22             : #include <linux/debugobjects.h>
      23             : #include <linux/kallsyms.h>
      24             : #include <linux/list.h>
      25             : #include <linux/notifier.h>
      26             : #include <linux/rbtree.h>
      27             : #include <linux/xarray.h>
      28             : #include <linux/io.h>
      29             : #include <linux/rcupdate.h>
      30             : #include <linux/pfn.h>
      31             : #include <linux/kmemleak.h>
      32             : #include <linux/atomic.h>
      33             : #include <linux/compiler.h>
      34             : #include <linux/memcontrol.h>
      35             : #include <linux/llist.h>
      36             : #include <linux/bitops.h>
      37             : #include <linux/rbtree_augmented.h>
      38             : #include <linux/overflow.h>
      39             : #include <linux/pgtable.h>
      40             : #include <linux/uaccess.h>
      41             : #include <linux/hugetlb.h>
      42             : #include <linux/sched/mm.h>
      43             : #include <asm/tlbflush.h>
      44             : #include <asm/shmparam.h>
      45             : 
      46             : #include "internal.h"
      47             : #include "pgalloc-track.h"
      48             : 
      49             : #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
      50             : static unsigned int __ro_after_init ioremap_max_page_shift = BITS_PER_LONG - 1;
      51             : 
      52             : static int __init set_nohugeiomap(char *str)
      53             : {
      54             :         ioremap_max_page_shift = PAGE_SHIFT;
      55             :         return 0;
      56             : }
      57             : early_param("nohugeiomap", set_nohugeiomap);
      58             : #else /* CONFIG_HAVE_ARCH_HUGE_VMAP */
      59             : static const unsigned int ioremap_max_page_shift = PAGE_SHIFT;
      60             : #endif  /* CONFIG_HAVE_ARCH_HUGE_VMAP */
      61             : 
      62             : #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
      63             : static bool __ro_after_init vmap_allow_huge = true;
      64             : 
      65             : static int __init set_nohugevmalloc(char *str)
      66             : {
      67             :         vmap_allow_huge = false;
      68             :         return 0;
      69             : }
      70             : early_param("nohugevmalloc", set_nohugevmalloc);
      71             : #else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
      72             : static const bool vmap_allow_huge = false;
      73             : #endif  /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
      74             : 
      75           0 : bool is_vmalloc_addr(const void *x)
      76             : {
      77           0 :         unsigned long addr = (unsigned long)kasan_reset_tag(x);
      78             : 
      79           0 :         return addr >= VMALLOC_START && addr < VMALLOC_END;
      80             : }
      81             : EXPORT_SYMBOL(is_vmalloc_addr);
      82             : 
      83             : struct vfree_deferred {
      84             :         struct llist_head list;
      85             :         struct work_struct wq;
      86             : };
      87             : static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);
      88             : 
      89             : static void __vunmap(const void *, int);
      90             : 
      91           0 : static void free_work(struct work_struct *w)
      92             : {
      93           0 :         struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
      94             :         struct llist_node *t, *llnode;
      95             : 
      96           0 :         llist_for_each_safe(llnode, t, llist_del_all(&p->list))
      97           0 :                 __vunmap((void *)llnode, 1);
      98           0 : }
      99             : 
     100             : /*** Page table manipulation functions ***/
     101           0 : static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
     102             :                         phys_addr_t phys_addr, pgprot_t prot,
     103             :                         unsigned int max_page_shift, pgtbl_mod_mask *mask)
     104             : {
     105             :         pte_t *pte;
     106             :         u64 pfn;
     107           0 :         unsigned long size = PAGE_SIZE;
     108             : 
     109           0 :         pfn = phys_addr >> PAGE_SHIFT;
     110           0 :         pte = pte_alloc_kernel_track(pmd, addr, mask);
     111           0 :         if (!pte)
     112             :                 return -ENOMEM;
     113             :         do {
     114           0 :                 BUG_ON(!pte_none(*pte));
     115             : 
     116             : #ifdef CONFIG_HUGETLB_PAGE
     117             :                 size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift);
     118             :                 if (size != PAGE_SIZE) {
     119             :                         pte_t entry = pfn_pte(pfn, prot);
     120             : 
     121             :                         entry = arch_make_huge_pte(entry, ilog2(size), 0);
     122             :                         set_huge_pte_at(&init_mm, addr, pte, entry);
     123             :                         pfn += PFN_DOWN(size);
     124             :                         continue;
     125             :                 }
     126             : #endif
     127           0 :                 set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
     128           0 :                 pfn++;
     129           0 :         } while (pte += PFN_DOWN(size), addr += size, addr != end);
     130           0 :         *mask |= PGTBL_PTE_MODIFIED;
     131             :         return 0;
     132             : }
     133             : 
     134             : static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
     135             :                         phys_addr_t phys_addr, pgprot_t prot,
     136             :                         unsigned int max_page_shift)
     137             : {
     138             :         if (max_page_shift < PMD_SHIFT)
     139             :                 return 0;
     140             : 
     141             :         if (!arch_vmap_pmd_supported(prot))
     142             :                 return 0;
     143             : 
     144             :         if ((end - addr) != PMD_SIZE)
     145             :                 return 0;
     146             : 
     147             :         if (!IS_ALIGNED(addr, PMD_SIZE))
     148             :                 return 0;
     149             : 
     150             :         if (!IS_ALIGNED(phys_addr, PMD_SIZE))
     151             :                 return 0;
     152             : 
     153             :         if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
     154             :                 return 0;
     155             : 
     156             :         return pmd_set_huge(pmd, phys_addr, prot);
     157             : }
     158             : 
     159           0 : static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
     160             :                         phys_addr_t phys_addr, pgprot_t prot,
     161             :                         unsigned int max_page_shift, pgtbl_mod_mask *mask)
     162             : {
     163             :         pmd_t *pmd;
     164             :         unsigned long next;
     165             : 
     166           0 :         pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
     167           0 :         if (!pmd)
     168             :                 return -ENOMEM;
     169             :         do {
     170           0 :                 next = pmd_addr_end(addr, end);
     171             : 
     172           0 :                 if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot,
     173             :                                         max_page_shift)) {
     174             :                         *mask |= PGTBL_PMD_MODIFIED;
     175             :                         continue;
     176             :                 }
     177             : 
     178           0 :                 if (vmap_pte_range(pmd, addr, next, phys_addr, prot, max_page_shift, mask))
     179             :                         return -ENOMEM;
     180           0 :         } while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
     181             :         return 0;
     182             : }
     183             : 
     184             : static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
     185             :                         phys_addr_t phys_addr, pgprot_t prot,
     186             :                         unsigned int max_page_shift)
     187             : {
     188             :         if (max_page_shift < PUD_SHIFT)
     189             :                 return 0;
     190             : 
     191             :         if (!arch_vmap_pud_supported(prot))
     192             :                 return 0;
     193             : 
     194             :         if ((end - addr) != PUD_SIZE)
     195             :                 return 0;
     196             : 
     197             :         if (!IS_ALIGNED(addr, PUD_SIZE))
     198             :                 return 0;
     199             : 
     200             :         if (!IS_ALIGNED(phys_addr, PUD_SIZE))
     201             :                 return 0;
     202             : 
     203             :         if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
     204             :                 return 0;
     205             : 
     206             :         return pud_set_huge(pud, phys_addr, prot);
     207             : }
     208             : 
     209             : static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
     210             :                         phys_addr_t phys_addr, pgprot_t prot,
     211             :                         unsigned int max_page_shift, pgtbl_mod_mask *mask)
     212             : {
     213             :         pud_t *pud;
     214             :         unsigned long next;
     215             : 
     216           0 :         pud = pud_alloc_track(&init_mm, p4d, addr, mask);
     217             :         if (!pud)
     218             :                 return -ENOMEM;
     219             :         do {
     220           0 :                 next = pud_addr_end(addr, end);
     221             : 
     222           0 :                 if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot,
     223             :                                         max_page_shift)) {
     224             :                         *mask |= PGTBL_PUD_MODIFIED;
     225             :                         continue;
     226             :                 }
     227             : 
     228           0 :                 if (vmap_pmd_range(pud, addr, next, phys_addr, prot,
     229             :                                         max_page_shift, mask))
     230             :                         return -ENOMEM;
     231           0 :         } while (pud++, phys_addr += (next - addr), addr = next, addr != end);
     232             :         return 0;
     233             : }
     234             : 
     235             : static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
     236             :                         phys_addr_t phys_addr, pgprot_t prot,
     237             :                         unsigned int max_page_shift)
     238             : {
     239             :         if (max_page_shift < P4D_SHIFT)
     240             :                 return 0;
     241             : 
     242             :         if (!arch_vmap_p4d_supported(prot))
     243             :                 return 0;
     244             : 
     245             :         if ((end - addr) != P4D_SIZE)
     246             :                 return 0;
     247             : 
     248             :         if (!IS_ALIGNED(addr, P4D_SIZE))
     249             :                 return 0;
     250             : 
     251             :         if (!IS_ALIGNED(phys_addr, P4D_SIZE))
     252             :                 return 0;
     253             : 
     254             :         if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
     255             :                 return 0;
     256             : 
     257             :         return p4d_set_huge(p4d, phys_addr, prot);
     258             : }
     259             : 
     260           0 : static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
     261             :                         phys_addr_t phys_addr, pgprot_t prot,
     262             :                         unsigned int max_page_shift, pgtbl_mod_mask *mask)
     263             : {
     264             :         p4d_t *p4d;
     265             :         unsigned long next;
     266             : 
     267           0 :         p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
     268           0 :         if (!p4d)
     269             :                 return -ENOMEM;
     270             :         do {
     271           0 :                 next = p4d_addr_end(addr, end);
     272             : 
     273           0 :                 if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot,
     274             :                                         max_page_shift)) {
     275             :                         *mask |= PGTBL_P4D_MODIFIED;
     276             :                         continue;
     277             :                 }
     278             : 
     279           0 :                 if (vmap_pud_range(p4d, addr, next, phys_addr, prot,
     280             :                                         max_page_shift, mask))
     281             :                         return -ENOMEM;
     282           0 :         } while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
     283             :         return 0;
     284             : }
     285             : 
     286           0 : static int vmap_range_noflush(unsigned long addr, unsigned long end,
     287             :                         phys_addr_t phys_addr, pgprot_t prot,
     288             :                         unsigned int max_page_shift)
     289             : {
     290             :         pgd_t *pgd;
     291             :         unsigned long start;
     292             :         unsigned long next;
     293             :         int err;
     294           0 :         pgtbl_mod_mask mask = 0;
     295             : 
     296             :         might_sleep();
     297           0 :         BUG_ON(addr >= end);
     298             : 
     299           0 :         start = addr;
     300           0 :         pgd = pgd_offset_k(addr);
     301             :         do {
     302           0 :                 next = pgd_addr_end(addr, end);
     303           0 :                 err = vmap_p4d_range(pgd, addr, next, phys_addr, prot,
     304             :                                         max_page_shift, &mask);
     305           0 :                 if (err)
     306             :                         break;
     307           0 :         } while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
     308             : 
     309             :         if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
     310             :                 arch_sync_kernel_mappings(start, end);
     311             : 
     312           0 :         return err;
     313             : }
     314             : 
     315           0 : int ioremap_page_range(unsigned long addr, unsigned long end,
     316             :                 phys_addr_t phys_addr, pgprot_t prot)
     317             : {
     318             :         int err;
     319             : 
     320           0 :         err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot),
     321             :                                  ioremap_max_page_shift);
     322           0 :         flush_cache_vmap(addr, end);
     323           0 :         return err;
     324             : }
     325             : 
     326           0 : static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
     327             :                              pgtbl_mod_mask *mask)
     328             : {
     329             :         pte_t *pte;
     330             : 
     331           0 :         pte = pte_offset_kernel(pmd, addr);
     332             :         do {
     333           0 :                 pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
     334           0 :                 WARN_ON(!pte_none(ptent) && !pte_present(ptent));
     335           0 :         } while (pte++, addr += PAGE_SIZE, addr != end);
     336           0 :         *mask |= PGTBL_PTE_MODIFIED;
     337           0 : }
     338             : 
     339           0 : static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
     340             :                              pgtbl_mod_mask *mask)
     341             : {
     342             :         pmd_t *pmd;
     343             :         unsigned long next;
     344             :         int cleared;
     345             : 
     346           0 :         pmd = pmd_offset(pud, addr);
     347             :         do {
     348           0 :                 next = pmd_addr_end(addr, end);
     349             : 
     350           0 :                 cleared = pmd_clear_huge(pmd);
     351           0 :                 if (cleared || pmd_bad(*pmd))
     352           0 :                         *mask |= PGTBL_PMD_MODIFIED;
     353             : 
     354             :                 if (cleared)
     355             :                         continue;
     356           0 :                 if (pmd_none_or_clear_bad(pmd))
     357           0 :                         continue;
     358           0 :                 vunmap_pte_range(pmd, addr, next, mask);
     359             : 
     360           0 :                 cond_resched();
     361           0 :         } while (pmd++, addr = next, addr != end);
     362           0 : }
     363             : 
     364           0 : static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
     365             :                              pgtbl_mod_mask *mask)
     366             : {
     367             :         pud_t *pud;
     368             :         unsigned long next;
     369             :         int cleared;
     370             : 
     371           0 :         pud = pud_offset(p4d, addr);
     372             :         do {
     373           0 :                 next = pud_addr_end(addr, end);
     374             : 
     375           0 :                 cleared = pud_clear_huge(pud);
     376           0 :                 if (cleared || pud_bad(*pud))
     377           0 :                         *mask |= PGTBL_PUD_MODIFIED;
     378             : 
     379             :                 if (cleared)
     380             :                         continue;
     381           0 :                 if (pud_none_or_clear_bad(pud))
     382           0 :                         continue;
     383           0 :                 vunmap_pmd_range(pud, addr, next, mask);
     384           0 :         } while (pud++, addr = next, addr != end);
     385           0 : }
     386             : 
     387             : static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
     388             :                              pgtbl_mod_mask *mask)
     389             : {
     390             :         p4d_t *p4d;
     391             :         unsigned long next;
     392             :         int cleared;
     393             : 
     394           0 :         p4d = p4d_offset(pgd, addr);
     395             :         do {
     396           0 :                 next = p4d_addr_end(addr, end);
     397             : 
     398           0 :                 cleared = p4d_clear_huge(p4d);
     399           0 :                 if (cleared || p4d_bad(*p4d))
     400             :                         *mask |= PGTBL_P4D_MODIFIED;
     401             : 
     402             :                 if (cleared)
     403             :                         continue;
     404           0 :                 if (p4d_none_or_clear_bad(p4d))
     405             :                         continue;
     406           0 :                 vunmap_pud_range(p4d, addr, next, mask);
     407           0 :         } while (p4d++, addr = next, addr != end);
     408             : }
     409             : 
     410             : /*
     411             :  * vunmap_range_noflush is similar to vunmap_range, but does not
     412             :  * flush caches or TLBs.
     413             :  *
     414             :  * The caller is responsible for calling flush_cache_vunmap() before calling
     415             :  * this function, and flush_tlb_kernel_range() after it has returned
     416             :  * successfully (and before the addresses are expected to cause a page fault
     417             :  * or be re-mapped for something else, if TLB flushes are being delayed or
     418             :  * coalesced).
     419             :  *
     420             :  * This is an internal function only. Do not use outside mm/.
     421             :  */
     422           0 : void vunmap_range_noflush(unsigned long start, unsigned long end)
     423             : {
     424             :         unsigned long next;
     425             :         pgd_t *pgd;
     426           0 :         unsigned long addr = start;
     427           0 :         pgtbl_mod_mask mask = 0;
     428             : 
     429           0 :         BUG_ON(addr >= end);
     430           0 :         pgd = pgd_offset_k(addr);
     431             :         do {
     432           0 :                 next = pgd_addr_end(addr, end);
     433           0 :                 if (pgd_bad(*pgd))
     434             :                         mask |= PGTBL_PGD_MODIFIED;
     435           0 :                 if (pgd_none_or_clear_bad(pgd))
     436             :                         continue;
     437             :                 vunmap_p4d_range(pgd, addr, next, &mask);
     438           0 :         } while (pgd++, addr = next, addr != end);
     439             : 
     440             :         if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
     441             :                 arch_sync_kernel_mappings(start, end);
     442           0 : }
     443             : 
     444             : /**
     445             :  * vunmap_range - unmap kernel virtual addresses
     446             :  * @addr: start of the VM area to unmap
     447             :  * @end: end of the VM area to unmap (non-inclusive)
     448             :  *
     449             :  * Clears any present PTEs in the virtual address range, flushes TLBs and
     450             :  * caches. Any subsequent access to the address before it has been re-mapped
     451             :  * is a kernel bug.
     452             :  */
     453           0 : void vunmap_range(unsigned long addr, unsigned long end)
     454             : {
     455           0 :         flush_cache_vunmap(addr, end);
     456           0 :         vunmap_range_noflush(addr, end);
     457           0 :         flush_tlb_kernel_range(addr, end);
     458           0 : }
     459             : 
     460          15 : static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
     461             :                 unsigned long end, pgprot_t prot, struct page **pages, int *nr,
     462             :                 pgtbl_mod_mask *mask)
     463             : {
     464             :         pte_t *pte;
     465             : 
     466             :         /*
     467             :          * nr is a running index into the array which helps higher level
     468             :          * callers keep track of where we're up to.
     469             :          */
     470             : 
     471          30 :         pte = pte_alloc_kernel_track(pmd, addr, mask);
     472          15 :         if (!pte)
     473             :                 return -ENOMEM;
     474             :         do {
     475          60 :                 struct page *page = pages[*nr];
     476             : 
     477          60 :                 if (WARN_ON(!pte_none(*pte)))
     478             :                         return -EBUSY;
     479          60 :                 if (WARN_ON(!page))
     480             :                         return -ENOMEM;
     481         120 :                 set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
     482          60 :                 (*nr)++;
     483          60 :         } while (pte++, addr += PAGE_SIZE, addr != end);
     484          15 :         *mask |= PGTBL_PTE_MODIFIED;
     485          15 :         return 0;
     486             : }
     487             : 
     488          15 : static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
     489             :                 unsigned long end, pgprot_t prot, struct page **pages, int *nr,
     490             :                 pgtbl_mod_mask *mask)
     491             : {
     492             :         pmd_t *pmd;
     493             :         unsigned long next;
     494             : 
     495          15 :         pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
     496          15 :         if (!pmd)
     497             :                 return -ENOMEM;
     498             :         do {
     499          15 :                 next = pmd_addr_end(addr, end);
     500          15 :                 if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask))
     501             :                         return -ENOMEM;
     502          15 :         } while (pmd++, addr = next, addr != end);
     503             :         return 0;
     504             : }
     505             : 
     506             : static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr,
     507             :                 unsigned long end, pgprot_t prot, struct page **pages, int *nr,
     508             :                 pgtbl_mod_mask *mask)
     509             : {
     510             :         pud_t *pud;
     511             :         unsigned long next;
     512             : 
     513          30 :         pud = pud_alloc_track(&init_mm, p4d, addr, mask);
     514             :         if (!pud)
     515             :                 return -ENOMEM;
     516             :         do {
     517          15 :                 next = pud_addr_end(addr, end);
     518          15 :                 if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask))
     519             :                         return -ENOMEM;
     520          15 :         } while (pud++, addr = next, addr != end);
     521             :         return 0;
     522             : }
     523             : 
     524          15 : static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
     525             :                 unsigned long end, pgprot_t prot, struct page **pages, int *nr,
     526             :                 pgtbl_mod_mask *mask)
     527             : {
     528             :         p4d_t *p4d;
     529             :         unsigned long next;
     530             : 
     531          30 :         p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
     532          15 :         if (!p4d)
     533             :                 return -ENOMEM;
     534             :         do {
     535          15 :                 next = p4d_addr_end(addr, end);
     536          15 :                 if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask))
     537             :                         return -ENOMEM;
     538          15 :         } while (p4d++, addr = next, addr != end);
     539          15 :         return 0;
     540             : }
     541             : 
     542          15 : static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
     543             :                 pgprot_t prot, struct page **pages)
     544             : {
     545          15 :         unsigned long start = addr;
     546             :         pgd_t *pgd;
     547             :         unsigned long next;
     548          15 :         int err = 0;
     549          15 :         int nr = 0;
     550          15 :         pgtbl_mod_mask mask = 0;
     551             : 
     552          15 :         BUG_ON(addr >= end);
     553          30 :         pgd = pgd_offset_k(addr);
     554             :         do {
     555          15 :                 next = pgd_addr_end(addr, end);
     556          15 :                 if (pgd_bad(*pgd))
     557             :                         mask |= PGTBL_PGD_MODIFIED;
     558          15 :                 err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
     559          15 :                 if (err)
     560             :                         return err;
     561          15 :         } while (pgd++, addr = next, addr != end);
     562             : 
     563             :         if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
     564             :                 arch_sync_kernel_mappings(start, end);
     565             : 
     566             :         return 0;
     567             : }
     568             : 
     569             : /*
     570             :  * vmap_pages_range_noflush is similar to vmap_pages_range, but does not
     571             :  * flush caches.
     572             :  *
     573             :  * The caller is responsible for calling flush_cache_vmap() after this
     574             :  * function returns successfully and before the addresses are accessed.
     575             :  *
     576             :  * This is an internal function only. Do not use outside mm/.
     577             :  */
     578          15 : int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
     579             :                 pgprot_t prot, struct page **pages, unsigned int page_shift)
     580             : {
     581          15 :         unsigned int i, nr = (end - addr) >> PAGE_SHIFT;
     582             : 
     583          15 :         WARN_ON(page_shift < PAGE_SHIFT);
     584             : 
     585             :         if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
     586             :                         page_shift == PAGE_SHIFT)
     587          15 :                 return vmap_small_pages_range_noflush(addr, end, prot, pages);
     588             : 
     589             :         for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
     590             :                 int err;
     591             : 
     592             :                 err = vmap_range_noflush(addr, addr + (1UL << page_shift),
     593             :                                         __pa(page_address(pages[i])), prot,
     594             :                                         page_shift);
     595             :                 if (err)
     596             :                         return err;
     597             : 
     598             :                 addr += 1UL << page_shift;
     599             :         }
     600             : 
     601             :         return 0;
     602             : }
     603             : 
     604             : /**
     605             :  * vmap_pages_range - map pages to a kernel virtual address
     606             :  * @addr: start of the VM area to map
     607             :  * @end: end of the VM area to map (non-inclusive)
     608             :  * @prot: page protection flags to use
     609             :  * @pages: pages to map (always PAGE_SIZE pages)
     610             :  * @page_shift: maximum shift that the pages may be mapped with, @pages must
     611             :  * be aligned and contiguous up to at least this shift.
     612             :  *
     613             :  * RETURNS:
     614             :  * 0 on success, -errno on failure.
     615             :  */
     616             : static int vmap_pages_range(unsigned long addr, unsigned long end,
     617             :                 pgprot_t prot, struct page **pages, unsigned int page_shift)
     618             : {
     619             :         int err;
     620             : 
     621          15 :         err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
     622          15 :         flush_cache_vmap(addr, end);
     623             :         return err;
     624             : }
     625             : 
     626           0 : int is_vmalloc_or_module_addr(const void *x)
     627             : {
     628             :         /*
     629             :          * ARM, x86-64 and sparc64 put modules in a special place,
     630             :          * and fall back on vmalloc() if that fails. Others
     631             :          * just put it in the vmalloc space.
     632             :          */
     633             : #if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
     634             :         unsigned long addr = (unsigned long)kasan_reset_tag(x);
     635             :         if (addr >= MODULES_VADDR && addr < MODULES_END)
     636             :                 return 1;
     637             : #endif
     638           0 :         return is_vmalloc_addr(x);
     639             : }
     640             : 
     641             : /*
     642             :  * Walk a vmap address to the struct page it maps. Huge vmap mappings will
     643             :  * return the tail page that corresponds to the base page address, which
     644             :  * matches small vmap mappings.
     645             :  */
     646           0 : struct page *vmalloc_to_page(const void *vmalloc_addr)
     647             : {
     648           0 :         unsigned long addr = (unsigned long) vmalloc_addr;
     649           0 :         struct page *page = NULL;
     650           0 :         pgd_t *pgd = pgd_offset_k(addr);
     651             :         p4d_t *p4d;
     652             :         pud_t *pud;
     653             :         pmd_t *pmd;
     654             :         pte_t *ptep, pte;
     655             : 
     656             :         /*
     657             :          * XXX we might need to change this if we add VIRTUAL_BUG_ON for
     658             :          * architectures that do not vmalloc module space
     659             :          */
     660             :         VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
     661             : 
     662             :         if (pgd_none(*pgd))
     663             :                 return NULL;
     664           0 :         if (WARN_ON_ONCE(pgd_leaf(*pgd)))
     665             :                 return NULL; /* XXX: no allowance for huge pgd */
     666           0 :         if (WARN_ON_ONCE(pgd_bad(*pgd)))
     667             :                 return NULL;
     668             : 
     669           0 :         p4d = p4d_offset(pgd, addr);
     670             :         if (p4d_none(*p4d))
     671             :                 return NULL;
     672             :         if (p4d_leaf(*p4d))
     673             :                 return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
     674           0 :         if (WARN_ON_ONCE(p4d_bad(*p4d)))
     675             :                 return NULL;
     676             : 
     677           0 :         pud = pud_offset(p4d, addr);
     678           0 :         if (pud_none(*pud))
     679             :                 return NULL;
     680             :         if (pud_leaf(*pud))
     681             :                 return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
     682           0 :         if (WARN_ON_ONCE(pud_bad(*pud)))
     683             :                 return NULL;
     684             : 
     685           0 :         pmd = pmd_offset(pud, addr);
     686           0 :         if (pmd_none(*pmd))
     687             :                 return NULL;
     688             :         if (pmd_leaf(*pmd))
     689             :                 return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
     690           0 :         if (WARN_ON_ONCE(pmd_bad(*pmd)))
     691             :                 return NULL;
     692             : 
     693           0 :         ptep = pte_offset_map(pmd, addr);
     694           0 :         pte = *ptep;
     695           0 :         if (pte_present(pte))
     696           0 :                 page = pte_page(pte);
     697             :         pte_unmap(ptep);
     698             : 
     699             :         return page;
     700             : }
     701             : EXPORT_SYMBOL(vmalloc_to_page);
     702             : 
     703             : /*
     704             :  * Map a vmalloc()-space virtual address to the physical page frame number.
     705             :  */
     706           0 : unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
     707             : {
     708           0 :         return page_to_pfn(vmalloc_to_page(vmalloc_addr));
     709             : }
     710             : EXPORT_SYMBOL(vmalloc_to_pfn);
     711             : 
     712             : 
     713             : /*** Global kva allocator ***/
     714             : 
     715             : #define DEBUG_AUGMENT_PROPAGATE_CHECK 0
     716             : #define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0
     717             : 
     718             : 
     719             : static DEFINE_SPINLOCK(vmap_area_lock);
     720             : static DEFINE_SPINLOCK(free_vmap_area_lock);
     721             : /* Export for kexec only */
     722             : LIST_HEAD(vmap_area_list);
     723             : static struct rb_root vmap_area_root = RB_ROOT;
     724             : static bool vmap_initialized __read_mostly;
     725             : 
     726             : static struct rb_root purge_vmap_area_root = RB_ROOT;
     727             : static LIST_HEAD(purge_vmap_area_list);
     728             : static DEFINE_SPINLOCK(purge_vmap_area_lock);
     729             : 
     730             : /*
     731             :  * This kmem_cache is used for vmap_area objects. Instead of
     732             :  * allocating from slab we reuse an object from this cache to
     733             :  * make things faster. Especially in "no edge" splitting of
     734             :  * free block.
     735             :  */
     736             : static struct kmem_cache *vmap_area_cachep;
     737             : 
     738             : /*
     739             :  * This linked list is used in pair with free_vmap_area_root.
     740             :  * It gives O(1) access to prev/next to perform fast coalescing.
     741             :  */
     742             : static LIST_HEAD(free_vmap_area_list);
     743             : 
     744             : /*
     745             :  * This augmented red-black tree represents the free vmap space.
     746             :  * All vmap_area objects in this tree are sorted by va->va_start
     747             :  * address. It is used for allocation and merging when a vmap
     748             :  * object is released.
     749             :  *
     750             :  * Each vmap_area node contains a maximum available free block
     751             :  * of its sub-tree, right or left. Therefore it is possible to
     752             :  * find a lowest match of free area.
     753             :  */
     754             : static struct rb_root free_vmap_area_root = RB_ROOT;
     755             : 
     756             : /*
     757             :  * Preload a CPU with one object for "no edge" split case. The
     758             :  * aim is to get rid of allocations from the atomic context, thus
     759             :  * to use more permissive allocation masks.
     760             :  */
     761             : static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
     762             : 
     763             : static __always_inline unsigned long
     764             : va_size(struct vmap_area *va)
     765             : {
     766          99 :         return (va->va_end - va->va_start);
     767             : }
     768             : 
     769             : static __always_inline unsigned long
     770             : get_subtree_max_size(struct rb_node *node)
     771             : {
     772             :         struct vmap_area *va;
     773             : 
     774         133 :         va = rb_entry_safe(node, struct vmap_area, rb_node);
     775         133 :         return va ? va->subtree_max_size : 0;
     776             : }
     777             : 
     778         147 : RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
     779             :         struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)
     780             : 
     781             : static void purge_vmap_area_lazy(void);
     782             : static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
     783             : static void drain_vmap_area_work(struct work_struct *work);
     784             : static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work);
     785             : 
     786             : static atomic_long_t nr_vmalloc_pages;
     787             : 
     788           0 : unsigned long vmalloc_nr_pages(void)
     789             : {
     790           0 :         return atomic_long_read(&nr_vmalloc_pages);
     791             : }
     792             : 
     793             : static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr)
     794             : {
     795           0 :         struct vmap_area *va = NULL;
     796           0 :         struct rb_node *n = vmap_area_root.rb_node;
     797             : 
     798           0 :         addr = (unsigned long)kasan_reset_tag((void *)addr);
     799             : 
     800           0 :         while (n) {
     801             :                 struct vmap_area *tmp;
     802             : 
     803           0 :                 tmp = rb_entry(n, struct vmap_area, rb_node);
     804           0 :                 if (tmp->va_end > addr) {
     805           0 :                         va = tmp;
     806           0 :                         if (tmp->va_start <= addr)
     807             :                                 break;
     808             : 
     809           0 :                         n = n->rb_left;
     810             :                 } else
     811           0 :                         n = n->rb_right;
     812             :         }
     813             : 
     814             :         return va;
     815             : }
     816             : 
     817             : static struct vmap_area *__find_vmap_area(unsigned long addr)
     818             : {
     819          15 :         struct rb_node *n = vmap_area_root.rb_node;
     820             : 
     821          15 :         addr = (unsigned long)kasan_reset_tag((void *)addr);
     822             : 
     823          59 :         while (n) {
     824             :                 struct vmap_area *va;
     825             : 
     826          59 :                 va = rb_entry(n, struct vmap_area, rb_node);
     827          59 :                 if (addr < va->va_start)
     828           0 :                         n = n->rb_left;
     829          59 :                 else if (addr >= va->va_end)
     830          44 :                         n = n->rb_right;
     831             :                 else
     832             :                         return va;
     833             :         }
     834             : 
     835             :         return NULL;
     836             : }
     837             : 
     838             : /*
     839             :  * This function returns the addresses of the parent node
     840             :  * and of its left or right link for further processing.
     841             :  *
     842             :  * If the range overlaps an existing one, NULL is returned.
     843             :  * In that case all further steps of inserting the conflicting
     844             :  * range must be declined and the situation treated as a bug.
     845             :  */
     846             : static __always_inline struct rb_node **
     847             : find_va_links(struct vmap_area *va,
     848             :         struct rb_root *root, struct rb_node *from,
     849             :         struct rb_node **parent)
     850             : {
     851             :         struct vmap_area *tmp_va;
     852             :         struct rb_node **link;
     853             : 
     854          16 :         if (root) {
     855          16 :                 link = &root->rb_node;
     856          16 :                 if (unlikely(!*link)) {
     857             :                         *parent = NULL;
     858             :                         return link;
     859             :                 }
     860             :         } else {
     861             :                 link = &from;
     862             :         }
     863             : 
     864             :         /*
     865             :          * Go to the bottom of the tree. When we hit the last point
     866             :          * we end up with the parent rb_node and the correct direction,
     867             :          * called link here, where the new va->rb_node will be attached.
     868             :          */
     869             :         do {
     870          75 :                 tmp_va = rb_entry(*link, struct vmap_area, rb_node);
     871             : 
     872             :                 /*
     873             :                  * During the traversal we also do a sanity check:
     874             :                  * emit a WARN() and return NULL if there is a partial
     875             :                  * (left/right side) or full overlap.
     876             :                  */
     877          90 :                 if (va->va_start < tmp_va->va_end &&
     878          15 :                                 va->va_end <= tmp_va->va_start)
     879          15 :                         link = &(*link)->rb_left;
     880          60 :                 else if (va->va_end > tmp_va->va_start &&
     881             :                                 va->va_start >= tmp_va->va_end)
     882          60 :                         link = &(*link)->rb_right;
     883             :                 else {
     884           0 :                         WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
     885             :                                 va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end);
     886             : 
     887             :                         return NULL;
     888             :                 }
     889          75 :         } while (*link);
     890             : 
     891          29 :         *parent = &tmp_va->rb_node;
     892             :         return link;
     893             : }
     894             : 
     895             : static __always_inline struct list_head *
     896             : get_va_next_sibling(struct rb_node *parent, struct rb_node **link)
     897             : {
     898             :         struct list_head *list;
     899             : 
     900           0 :         if (unlikely(!parent))
     901             :                 /*
     902             :                  * The red-black tree where we try to find VA neighbors
     903             :                  * before merging or inserting is empty, i.e. it means
     904             :                  * there is no free vmap space. Normally it does not
     905             :                  * happen but we handle this case anyway.
     906             :                  */
     907             :                 return NULL;
     908             : 
     909           0 :         list = &rb_entry(parent, struct vmap_area, rb_node)->list;
     910           0 :         return (&parent->rb_right == link ? list->next : list);
     911             : }
     912             : 
     913             : static __always_inline void
     914             : link_va(struct vmap_area *va, struct rb_root *root,
     915             :         struct rb_node *parent, struct rb_node **link, struct list_head *head)
     916             : {
     917             :         /*
     918             :          * VA is still not in the list, but we can
     919             :          * identify its future previous list_head node.
     920             :          */
     921          31 :         if (likely(parent)) {
     922          29 :                 head = &rb_entry(parent, struct vmap_area, rb_node)->list;
     923          29 :                 if (&parent->rb_right != link)
     924           8 :                         head = head->prev;
     925             :         }
     926             : 
     927             :         /* Insert to the rb-tree */
     928          62 :         rb_link_node(&va->rb_node, parent, link);
     929          31 :         if (root == &free_vmap_area_root) {
     930             :                 /*
     931             :                  * Some explanation here. Just perform simple insertion
     932             :                  * to the tree. We do not set va->subtree_max_size to
     933             :                  * its current size before calling rb_insert_augmented().
     934             :                  * This is because we populate the tree from the bottom
     935             :                  * to parent levels when the node _is_ in the tree.
     936             :                  *
     937             :                  * Therefore we set subtree_max_size to zero after insertion,
     938             :                  * to let augment_tree_propagate_from() put everything in
     939             :                  * the correct order later on.
     940             :                  */
     941          16 :                 rb_insert_augmented(&va->rb_node,
     942             :                         root, &free_vmap_area_rb_augment_cb);
     943          16 :                 va->subtree_max_size = 0;
     944             :         } else {
     945          15 :                 rb_insert_color(&va->rb_node, root);
     946             :         }
     947             : 
     948             :         /* Address-sort this list */
     949          47 :         list_add(&va->list, head);
     950             : }
     951             : 
     952             : static __always_inline void
     953             : unlink_va(struct vmap_area *va, struct rb_root *root)
     954             : {
     955           0 :         if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
     956             :                 return;
     957             : 
     958             :         if (root == &free_vmap_area_root)
     959           0 :                 rb_erase_augmented(&va->rb_node,
     960             :                         root, &free_vmap_area_rb_augment_cb);
     961             :         else
     962           0 :                 rb_erase(&va->rb_node, root);
     963             : 
     964           0 :         list_del(&va->list);
     965           0 :         RB_CLEAR_NODE(&va->rb_node);
     966             : }
     967             : 
     968             : #if DEBUG_AUGMENT_PROPAGATE_CHECK
     969             : /*
     970             :  * Gets called when remove the node and rotate.
     971             :  */
     972             : static __always_inline unsigned long
     973             : compute_subtree_max_size(struct vmap_area *va)
     974             : {
     975             :         return max3(va_size(va),
     976             :                 get_subtree_max_size(va->rb_node.rb_left),
     977             :                 get_subtree_max_size(va->rb_node.rb_right));
     978             : }
     979             : 
     980             : static void
     981             : augment_tree_propagate_check(void)
     982             : {
     983             :         struct vmap_area *va;
     984             :         unsigned long computed_size;
     985             : 
     986             :         list_for_each_entry(va, &free_vmap_area_list, list) {
     987             :                 computed_size = compute_subtree_max_size(va);
     988             :                 if (computed_size != va->subtree_max_size)
     989             :                         pr_emerg("tree is corrupted: %lu, %lu\n",
     990             :                                 va_size(va), va->subtree_max_size);
     991             :         }
     992             : }
     993             : #endif
     994             : 
     995             : /*
     996             :  * This function populates subtree_max_size from bottom to upper
     997             :  * levels starting from VA point. The propagation must be done
     998             :  * when VA size is modified by changing its va_start/va_end. Or
     999             :  * in case of newly inserting of VA to the tree.
    1000             :  *
    1001             :  * It means that augment_tree_propagate_from() must be called:
    1002             :  * - After VA has been inserted to the tree(free path);
    1003             :  * - After VA has been shrunk(allocation path);
    1004             :  * - After VA has been increased(merging path).
    1005             :  *
    1006             :  * Please note that, it does not mean that upper parent nodes
    1007             :  * and their subtree_max_size are recalculated all the time up
    1008             :  * to the root node.
    1009             :  *
    1010             :  *       4--8
    1011             :  *        /\
    1012             :  *       /  \
    1013             :  *      /    \
    1014             :  *    2--2  8--8
    1015             :  *
    1016             :  * For example if we modify the node 4, shrinking it to 2, then
    1017             :  * no modification is required. If we shrink the node 2 to 1
    1018             :  * its subtree_max_size is updated only, and set to 1. If we shrink
    1019             :  * the node 8 to 6, then its subtree_max_size is set to 6 and parent
    1020             :  * node becomes 4--6.
    1021             :  */
    1022             : static __always_inline void
    1023             : augment_tree_propagate_from(struct vmap_area *va)
    1024             : {
    1025             :         /*
    1026             :          * Populate the tree from bottom towards the root until
    1027             :          * the calculated maximum available size of checked node
    1028             :          * is equal to its current one.
    1029             :          */
    1030          31 :         free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL);
    1031             : 
    1032             : #if DEBUG_AUGMENT_PROPAGATE_CHECK
    1033             :         augment_tree_propagate_check();
    1034             : #endif
    1035             : }
    1036             : 
    1037             : static void
    1038          15 : insert_vmap_area(struct vmap_area *va,
    1039             :         struct rb_root *root, struct list_head *head)
    1040             : {
    1041             :         struct rb_node **link;
    1042             :         struct rb_node *parent;
    1043             : 
    1044          15 :         link = find_va_links(va, root, NULL, &parent);
    1045          15 :         if (link)
    1046          15 :                 link_va(va, root, parent, link, head);
    1047          15 : }
    1048             : 
    1049             : static void
    1050          16 : insert_vmap_area_augment(struct vmap_area *va,
    1051             :         struct rb_node *from, struct rb_root *root,
    1052             :         struct list_head *head)
    1053             : {
    1054             :         struct rb_node **link;
    1055             :         struct rb_node *parent;
    1056             : 
    1057          16 :         if (from)
    1058             :                 link = find_va_links(va, NULL, from, &parent);
    1059             :         else
    1060             :                 link = find_va_links(va, root, NULL, &parent);
    1061             : 
    1062          16 :         if (link) {
    1063          32 :                 link_va(va, root, parent, link, head);
    1064             :                 augment_tree_propagate_from(va);
    1065             :         }
    1066          16 : }
    1067             : 
    1068             : /*
    1069             :  * Merge de-allocated chunk of VA memory with previous
    1070             :  * and next free blocks. If coalesce is not done a new
    1071             :  * free area is inserted. If VA has been merged, it is
    1072             :  * freed.
    1073             :  *
    1074             :  * Please note, it can return NULL in case of overlapping
    1075             :  * ranges, after a WARN() report has been issued. Although
    1076             :  * that is buggy behaviour, the system can stay alive and
    1077             :  * keep going.
    1078             :  */
    1079             : static __always_inline struct vmap_area *
    1080             : merge_or_add_vmap_area(struct vmap_area *va,
    1081             :         struct rb_root *root, struct list_head *head)
    1082             : {
    1083             :         struct vmap_area *sibling;
    1084             :         struct list_head *next;
    1085             :         struct rb_node **link;
    1086             :         struct rb_node *parent;
    1087           0 :         bool merged = false;
    1088             : 
    1089             :         /*
    1090             :          * Find a place in the tree where VA potentially will be
    1091             :          * inserted, unless it is merged with its sibling/siblings.
    1092             :          */
    1093           0 :         link = find_va_links(va, root, NULL, &parent);
    1094           0 :         if (!link)
    1095             :                 return NULL;
    1096             : 
    1097             :         /*
    1098             :          * Get next node of VA to check if merging can be done.
    1099             :          */
    1100           0 :         next = get_va_next_sibling(parent, link);
    1101           0 :         if (unlikely(next == NULL))
    1102             :                 goto insert;
    1103             : 
    1104             :         /*
    1105             :          * start            end
    1106             :          * |                |
    1107             :          * |<------VA------>|<-----Next----->|
    1108             :          *                  |                |
    1109             :          *                  start            end
    1110             :          */
    1111           0 :         if (next != head) {
    1112           0 :                 sibling = list_entry(next, struct vmap_area, list);
    1113           0 :                 if (sibling->va_start == va->va_end) {
    1114           0 :                         sibling->va_start = va->va_start;
    1115             : 
    1116             :                         /* Free vmap_area object. */
    1117           0 :                         kmem_cache_free(vmap_area_cachep, va);
    1118             : 
    1119             :                         /* Point to the new merged area. */
    1120           0 :                         va = sibling;
    1121           0 :                         merged = true;
    1122             :                 }
    1123             :         }
    1124             : 
    1125             :         /*
    1126             :          * start            end
    1127             :          * |                |
    1128             :          * |<-----Prev----->|<------VA------>|
    1129             :          *                  |                |
    1130             :          *                  start            end
    1131             :          */
    1132           0 :         if (next->prev != head) {
    1133           0 :                 sibling = list_entry(next->prev, struct vmap_area, list);
    1134           0 :                 if (sibling->va_end == va->va_start) {
    1135             :                         /*
    1136             :                          * If both neighbors are coalesced, it is important
    1137             :                          * to unlink the "next" node first, followed by merging
    1138             :                          * with "previous" one. Otherwise the tree might not be
    1139             :                          * fully populated if a sibling's augmented value is
    1140             :                          * "normalized" because of rotation operations.
    1141             :                          */
    1142           0 :                         if (merged)
    1143             :                                 unlink_va(va, root);
    1144             : 
    1145           0 :                         sibling->va_end = va->va_end;
    1146             : 
    1147             :                         /* Free vmap_area object. */
    1148           0 :                         kmem_cache_free(vmap_area_cachep, va);
    1149             : 
    1150             :                         /* Point to the new merged area. */
    1151           0 :                         va = sibling;
    1152           0 :                         merged = true;
    1153             :                 }
    1154             :         }
    1155             : 
    1156             : insert:
    1157           0 :         if (!merged)
    1158           0 :                 link_va(va, root, parent, link, head);
    1159             : 
    1160             :         return va;
    1161             : }
    1162             : 
    1163             : static __always_inline struct vmap_area *
    1164             : merge_or_add_vmap_area_augment(struct vmap_area *va,
    1165             :         struct rb_root *root, struct list_head *head)
    1166             : {
    1167           0 :         va = merge_or_add_vmap_area(va, root, head);
    1168           0 :         if (va)
    1169             :                 augment_tree_propagate_from(va);
    1170             : 
    1171             :         return va;
    1172             : }
    1173             : 
    1174             : static __always_inline bool
    1175             : is_within_this_va(struct vmap_area *va, unsigned long size,
    1176             :         unsigned long align, unsigned long vstart)
    1177             : {
    1178             :         unsigned long nva_start_addr;
    1179             : 
    1180          74 :         if (va->va_start > vstart)
    1181          59 :                 nva_start_addr = ALIGN(va->va_start, align);
    1182             :         else
    1183          15 :                 nva_start_addr = ALIGN(vstart, align);
    1184             : 
    1185             :         /* Can be overflowed due to big size or alignment. */
    1186          74 :         if (nva_start_addr + size < nva_start_addr ||
    1187             :                         nva_start_addr < vstart)
    1188             :                 return false;
    1189             : 
    1190          74 :         return (nva_start_addr + size <= va->va_end);
    1191             : }
    1192             : 
    1193             : /*
    1194             :  * Find the first free block(lowest start address) in the tree,
    1195             :  * that will accomplish the request corresponding to passing
    1196             :  * parameters. Please note, with an alignment bigger than PAGE_SIZE,
    1197             :  * a search length is adjusted to account for worst case alignment
    1198             :  * overhead.
    1199             :  */
    1200             : static __always_inline struct vmap_area *
    1201             : find_vmap_lowest_match(unsigned long size, unsigned long align,
    1202             :         unsigned long vstart, bool adjust_search_size)
    1203             : {
    1204             :         struct vmap_area *va;
    1205             :         struct rb_node *node;
    1206             :         unsigned long length;
    1207             : 
    1208             :         /* Start from the root. */
    1209          15 :         node = free_vmap_area_root.rb_node;
    1210             : 
    1211             :         /* Adjust the search size for alignment overhead. */
    1212          15 :         length = adjust_search_size ? size + align - 1 : size;
    1213             : 
    1214          74 :         while (node) {
    1215          74 :                 va = rb_entry(node, struct vmap_area, rb_node);
    1216             : 
    1217         170 :                 if (get_subtree_max_size(node->rb_left) >= length &&
    1218          22 :                                 vstart < va->va_start) {
    1219             :                         node = node->rb_left;
    1220             :                 } else {
    1221          52 :                         if (is_within_this_va(va, size, align, vstart))
    1222             :                                 return va;
    1223             : 
    1224             :                         /*
    1225             :                          * Does not make sense to go deeper towards the right
    1226             :                          * sub-tree if it does not have a free block that is
    1227             :                          * equal or bigger to the requested search length.
    1228             :                          */
    1229          76 :                         if (get_subtree_max_size(node->rb_right) >= length) {
    1230          24 :                                 node = node->rb_right;
    1231          24 :                                 continue;
    1232             :                         }
    1233             : 
    1234             :                         /*
    1235             :                          * OK. We roll back and find the first right sub-tree,
    1236             :                          * that will satisfy the search criteria. It can happen
    1237             :                          * due to "vstart" restriction or an alignment overhead
    1238             :                          * that is bigger then PAGE_SIZE.
    1239             :                          */
    1240          22 :                         while ((node = rb_parent(node))) {
    1241          22 :                                 va = rb_entry(node, struct vmap_area, rb_node);
    1242          22 :                                 if (is_within_this_va(va, size, align, vstart))
    1243             :                                         return va;
    1244             : 
    1245          42 :                                 if (get_subtree_max_size(node->rb_right) >= length &&
    1246             :                                                 vstart <= va->va_start) {
    1247             :                                         /*
    1248             :                                          * Shift the vstart forward. Please note, we update it with
    1249             :                                          * parent's start address adding "1" because we do not want
    1250             :                                          * to enter same sub-tree after it has already been checked
    1251             :                                          * and no suitable free block found there.
    1252             :                                          */
    1253          13 :                                         vstart = va->va_start + 1;
    1254          13 :                                         node = node->rb_right;
    1255             :                                         break;
    1256             :                                 }
    1257             :                         }
    1258             :                 }
    1259             :         }
    1260             : 
    1261             :         return NULL;
    1262             : }
    1263             : 
    1264             : #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
    1265             : #include <linux/random.h>
    1266             : 
    1267             : static struct vmap_area *
    1268             : find_vmap_lowest_linear_match(unsigned long size,
    1269             :         unsigned long align, unsigned long vstart)
    1270             : {
    1271             :         struct vmap_area *va;
    1272             : 
    1273             :         list_for_each_entry(va, &free_vmap_area_list, list) {
    1274             :                 if (!is_within_this_va(va, size, align, vstart))
    1275             :                         continue;
    1276             : 
    1277             :                 return va;
    1278             :         }
    1279             : 
    1280             :         return NULL;
    1281             : }
    1282             : 
    1283             : static void
    1284             : find_vmap_lowest_match_check(unsigned long size, unsigned long align)
    1285             : {
    1286             :         struct vmap_area *va_1, *va_2;
    1287             :         unsigned long vstart;
    1288             :         unsigned int rnd;
    1289             : 
    1290             :         get_random_bytes(&rnd, sizeof(rnd));
    1291             :         vstart = VMALLOC_START + rnd;
    1292             : 
    1293             :         va_1 = find_vmap_lowest_match(size, align, vstart, false);
    1294             :         va_2 = find_vmap_lowest_linear_match(size, align, vstart);
    1295             : 
    1296             :         if (va_1 != va_2)
    1297             :                 pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
    1298             :                         va_1, va_2, vstart);
    1299             : }
    1300             : #endif
    1301             : 
/*
 * How a requested range [nva_start_addr, nva_start_addr + size) relates to
 * the free area it is carved from. Produced by classify_va_fit_type() and
 * consumed by adjust_va_to_fit_type().
 */
enum fit_type {
	NOTHING_FIT = 0,	/* request is not contained in the area */
	FL_FIT_TYPE = 1,	/* full fit */
	LE_FIT_TYPE = 2,	/* left edge fit */
	RE_FIT_TYPE = 3,	/* right edge fit */
	NE_FIT_TYPE = 4		/* no edge fit */
};
    1309             : 
    1310             : static __always_inline enum fit_type
    1311             : classify_va_fit_type(struct vmap_area *va,
    1312             :         unsigned long nva_start_addr, unsigned long size)
    1313             : {
    1314             :         enum fit_type type;
    1315             : 
    1316             :         /* Check if it is within VA. */
    1317          30 :         if (nva_start_addr < va->va_start ||
    1318          15 :                         nva_start_addr + size > va->va_end)
    1319             :                 return NOTHING_FIT;
    1320             : 
    1321             :         /* Now classify. */
    1322          15 :         if (va->va_start == nva_start_addr) {
    1323           0 :                 if (va->va_end == nva_start_addr + size)
    1324             :                         type = FL_FIT_TYPE;
    1325             :                 else
    1326           0 :                         type = LE_FIT_TYPE;
    1327          15 :         } else if (va->va_end == nva_start_addr + size) {
    1328             :                 type = RE_FIT_TYPE;
    1329             :         } else {
    1330          15 :                 type = NE_FIT_TYPE;
    1331             :         }
    1332             : 
    1333             :         return type;
    1334             : }
    1335             : 
    1336             : static __always_inline int
    1337             : adjust_va_to_fit_type(struct vmap_area *va,
    1338             :         unsigned long nva_start_addr, unsigned long size,
    1339             :         enum fit_type type)
    1340             : {
    1341          15 :         struct vmap_area *lva = NULL;
    1342             : 
    1343          15 :         if (type == FL_FIT_TYPE) {
    1344             :                 /*
    1345             :                  * No need to split VA, it fully fits.
    1346             :                  *
    1347             :                  * |               |
    1348             :                  * V      NVA      V
    1349             :                  * |---------------|
    1350             :                  */
    1351           0 :                 unlink_va(va, &free_vmap_area_root);
    1352           0 :                 kmem_cache_free(vmap_area_cachep, va);
    1353          15 :         } else if (type == LE_FIT_TYPE) {
    1354             :                 /*
    1355             :                  * Split left edge of fit VA.
    1356             :                  *
    1357             :                  * |       |
    1358             :                  * V  NVA  V   R
    1359             :                  * |-------|-------|
    1360             :                  */
    1361           0 :                 va->va_start += size;
    1362          15 :         } else if (type == RE_FIT_TYPE) {
    1363             :                 /*
    1364             :                  * Split right edge of fit VA.
    1365             :                  *
    1366             :                  *         |       |
    1367             :                  *     L   V  NVA  V
    1368             :                  * |-------|-------|
    1369             :                  */
    1370           0 :                 va->va_end = nva_start_addr;
    1371          15 :         } else if (type == NE_FIT_TYPE) {
    1372             :                 /*
    1373             :                  * Split no edge of fit VA.
    1374             :                  *
    1375             :                  *     |       |
    1376             :                  *   L V  NVA  V R
    1377             :                  * |---|-------|---|
    1378             :                  */
    1379          15 :                 lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
    1380          15 :                 if (unlikely(!lva)) {
    1381             :                         /*
    1382             :                          * For percpu allocator we do not do any pre-allocation
    1383             :                          * and leave it as it is. The reason is it most likely
    1384             :                          * never ends up with NE_FIT_TYPE splitting. In case of
    1385             :                          * percpu allocations offsets and sizes are aligned to
    1386             :                          * fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE
    1387             :                          * are its main fitting cases.
    1388             :                          *
    1389             :                          * There are a few exceptions though, as an example it is
    1390             :                          * a first allocation (early boot up) when we have "one"
    1391             :                          * big free space that has to be split.
    1392             :                          *
    1393             :                          * Also we can hit this path in case of regular "vmap"
    1394             :                          * allocations, if "this" current CPU was not preloaded.
    1395             :                          * See the comment in alloc_vmap_area() why. If so, then
    1396             :                          * GFP_NOWAIT is used instead to get an extra object for
    1397             :                          * split purpose. That is rare and most time does not
    1398             :                          * occur.
    1399             :                          *
    1400             :                          * What happens if an allocation gets failed. Basically,
    1401             :                          * an "overflow" path is triggered to purge lazily freed
    1402             :                          * areas to free some memory, then, the "retry" path is
    1403             :                          * triggered to repeat one more time. See more details
    1404             :                          * in alloc_vmap_area() function.
    1405             :                          */
    1406           0 :                         lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
    1407           0 :                         if (!lva)
    1408             :                                 return -1;
    1409             :                 }
    1410             : 
    1411             :                 /*
    1412             :                  * Build the remainder.
    1413             :                  */
    1414          15 :                 lva->va_start = va->va_start;
    1415          15 :                 lva->va_end = nva_start_addr;
    1416             : 
    1417             :                 /*
    1418             :                  * Shrink this VA to remaining size.
    1419             :                  */
    1420          15 :                 va->va_start = nva_start_addr + size;
    1421             :         } else {
    1422             :                 return -1;
    1423             :         }
    1424             : 
    1425          15 :         if (type != FL_FIT_TYPE) {
    1426          15 :                 augment_tree_propagate_from(va);
    1427             : 
    1428          15 :                 if (lva)        /* type == NE_FIT_TYPE */
    1429          15 :                         insert_vmap_area_augment(lva, &va->rb_node,
    1430             :                                 &free_vmap_area_root, &free_vmap_area_list);
    1431             :         }
    1432             : 
    1433             :         return 0;
    1434             : }
    1435             : 
    1436             : /*
    1437             :  * Returns a start address of the newly allocated area, if success.
    1438             :  * Otherwise a vend is returned that indicates failure.
    1439             :  */
    1440             : static __always_inline unsigned long
    1441             : __alloc_vmap_area(unsigned long size, unsigned long align,
    1442             :         unsigned long vstart, unsigned long vend)
    1443             : {
    1444          15 :         bool adjust_search_size = true;
    1445             :         unsigned long nva_start_addr;
    1446             :         struct vmap_area *va;
    1447             :         enum fit_type type;
    1448             :         int ret;
    1449             : 
    1450             :         /*
    1451             :          * Do not adjust when:
    1452             :          *   a) align <= PAGE_SIZE, because it does not make any sense.
    1453             :          *      All blocks(their start addresses) are at least PAGE_SIZE
    1454             :          *      aligned anyway;
    1455             :          *   b) a short range where a requested size corresponds to exactly
    1456             :          *      specified [vstart:vend] interval and an alignment > PAGE_SIZE.
    1457             :          *      With adjusted search length an allocation would not succeed.
    1458             :          */
    1459          15 :         if (align <= PAGE_SIZE || (align > PAGE_SIZE && (vend - vstart) == size))
    1460           0 :                 adjust_search_size = false;
    1461             : 
    1462          30 :         va = find_vmap_lowest_match(size, align, vstart, adjust_search_size);
    1463          15 :         if (unlikely(!va))
    1464             :                 return vend;
    1465             : 
    1466          15 :         if (va->va_start > vstart)
    1467          14 :                 nva_start_addr = ALIGN(va->va_start, align);
    1468             :         else
    1469           1 :                 nva_start_addr = ALIGN(vstart, align);
    1470             : 
    1471             :         /* Check the "vend" restriction. */
    1472          15 :         if (nva_start_addr + size > vend)
    1473             :                 return vend;
    1474             : 
    1475             :         /* Classify what we have found. */
    1476          15 :         type = classify_va_fit_type(va, nva_start_addr, size);
    1477          15 :         if (WARN_ON_ONCE(type == NOTHING_FIT))
    1478             :                 return vend;
    1479             : 
    1480             :         /* Update the free vmap_area. */
    1481          15 :         ret = adjust_va_to_fit_type(va, nva_start_addr, size, type);
    1482          15 :         if (ret)
    1483             :                 return vend;
    1484             : 
    1485             : #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
    1486             :         find_vmap_lowest_match_check(size, align);
    1487             : #endif
    1488             : 
    1489             :         return nva_start_addr;
    1490             : }
    1491             : 
/*
 * Free a region of KVA allocated by alloc_vmap_area.
 *
 * @va is first unlinked from the busy tree under vmap_area_lock and then
 * handed to the free tree/list under free_vmap_area_lock. The two locks are
 * taken one after the other, never nested. The merge step may free the
 * vmap_area object itself, so @va must not be used after this call.
 */
static void free_vmap_area(struct vmap_area *va)
{
	/*
	 * Remove from the busy tree/list.
	 */
	spin_lock(&vmap_area_lock);
	unlink_va(va, &vmap_area_root);
	spin_unlock(&vmap_area_lock);

	/*
	 * Insert/Merge it back to the free tree/list.
	 */
	spin_lock(&free_vmap_area_lock);
	merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list);
	spin_unlock(&free_vmap_area_lock);
}
    1511             : 
    1512             : static inline void
    1513          15 : preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
    1514             : {
    1515          15 :         struct vmap_area *va = NULL;
    1516             : 
    1517             :         /*
    1518             :          * Preload this CPU with one extra vmap_area object. It is used
    1519             :          * when fit type of free area is NE_FIT_TYPE. It guarantees that
    1520             :          * a CPU that does an allocation is preloaded.
    1521             :          *
    1522             :          * We do it in non-atomic context, thus it allows us to use more
    1523             :          * permissive allocation masks to be more stable under low memory
    1524             :          * condition and high memory pressure.
    1525             :          */
    1526          15 :         if (!this_cpu_read(ne_fit_preload_node))
    1527          30 :                 va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
    1528             : 
    1529          15 :         spin_lock(lock);
    1530             : 
    1531          15 :         if (va && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, va))
    1532           0 :                 kmem_cache_free(vmap_area_cachep, va);
    1533          15 : }
    1534             : 
    1535             : /*
    1536             :  * Allocate a region of KVA of the specified size and alignment, within the
    1537             :  * vstart and vend.
    1538             :  */
    1539          15 : static struct vmap_area *alloc_vmap_area(unsigned long size,
    1540             :                                 unsigned long align,
    1541             :                                 unsigned long vstart, unsigned long vend,
    1542             :                                 int node, gfp_t gfp_mask)
    1543             : {
    1544             :         struct vmap_area *va;
    1545             :         unsigned long freed;
    1546             :         unsigned long addr;
    1547          15 :         int purged = 0;
    1548             :         int ret;
    1549             : 
    1550          15 :         BUG_ON(!size);
    1551          15 :         BUG_ON(offset_in_page(size));
    1552          15 :         BUG_ON(!is_power_of_2(align));
    1553             : 
    1554          15 :         if (unlikely(!vmap_initialized))
    1555             :                 return ERR_PTR(-EBUSY);
    1556             : 
    1557             :         might_sleep();
    1558          15 :         gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
    1559             : 
    1560          30 :         va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
    1561          15 :         if (unlikely(!va))
    1562             :                 return ERR_PTR(-ENOMEM);
    1563             : 
    1564             :         /*
    1565             :          * Only scan the relevant parts containing pointers to other objects
    1566             :          * to avoid false negatives.
    1567             :          */
    1568             :         kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);
    1569             : 
    1570             : retry:
    1571          15 :         preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node);
    1572          15 :         addr = __alloc_vmap_area(size, align, vstart, vend);
    1573          15 :         spin_unlock(&free_vmap_area_lock);
    1574             : 
    1575             :         /*
    1576             :          * If an allocation fails, the "vend" address is
    1577             :          * returned. Therefore trigger the overflow path.
    1578             :          */
    1579          15 :         if (unlikely(addr == vend))
    1580             :                 goto overflow;
    1581             : 
    1582          15 :         va->va_start = addr;
    1583          15 :         va->va_end = addr + size;
    1584          15 :         va->vm = NULL;
    1585             : 
    1586          15 :         spin_lock(&vmap_area_lock);
    1587          15 :         insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
    1588          15 :         spin_unlock(&vmap_area_lock);
    1589             : 
    1590          15 :         BUG_ON(!IS_ALIGNED(va->va_start, align));
    1591          15 :         BUG_ON(va->va_start < vstart);
    1592          15 :         BUG_ON(va->va_end > vend);
    1593             : 
    1594             :         ret = kasan_populate_vmalloc(addr, size);
    1595             :         if (ret) {
    1596             :                 free_vmap_area(va);
    1597             :                 return ERR_PTR(ret);
    1598             :         }
    1599             : 
    1600             :         return va;
    1601             : 
    1602             : overflow:
    1603           0 :         if (!purged) {
    1604           0 :                 purge_vmap_area_lazy();
    1605           0 :                 purged = 1;
    1606             :                 goto retry;
    1607             :         }
    1608             : 
    1609           0 :         freed = 0;
    1610           0 :         blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
    1611             : 
    1612           0 :         if (freed > 0) {
    1613             :                 purged = 0;
    1614             :                 goto retry;
    1615             :         }
    1616             : 
    1617           0 :         if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
    1618           0 :                 pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
    1619             :                         size);
    1620             : 
    1621           0 :         kmem_cache_free(vmap_area_cachep, va);
    1622           0 :         return ERR_PTR(-EBUSY);
    1623             : }
    1624             : 
/*
 * Add @nb to vmap_notify_list. alloc_vmap_area() calls that chain, passing
 * a pointer to an "unsigned long" counter, when it still cannot find an
 * address range after purging lazily freed areas; it retries if the counter
 * comes back greater than zero.
 *
 * Returns the result of blocking_notifier_chain_register().
 */
int register_vmap_purge_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&vmap_notify_list, nb);
}
EXPORT_SYMBOL_GPL(register_vmap_purge_notifier);
    1630             : 
/*
 * Remove @nb from vmap_notify_list.
 *
 * Returns the result of blocking_notifier_chain_unregister().
 */
int unregister_vmap_purge_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_unregister(&vmap_notify_list, nb);
}
EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
    1636             : 
    1637             : /*
    1638             :  * lazy_max_pages is the maximum amount of virtual address space we gather up
    1639             :  * before attempting to purge with a TLB flush.
    1640             :  *
    1641             :  * There is a tradeoff here: a larger number will cover more kernel page tables
    1642             :  * and take slightly longer to purge, but it will linearly reduce the number of
    1643             :  * global TLB flushes that must be performed. It would seem natural to scale
    1644             :  * this number up linearly with the number of CPUs (because vmapping activity
    1645             :  * could also scale linearly with the number of CPUs), however it is likely
    1646             :  * that in practice, workloads might be constrained in other ways that mean
    1647             :  * vmap activity will not scale linearly with CPUs. Also, I want to be
    1648             :  * conservative and not introduce a big latency on huge systems, so go with
    1649             :  * a less aggressive log scale. It will still be an improvement over the old
    1650             :  * code, and it will be simple to change the scale factor if we find that it
    1651             :  * becomes a problem on bigger systems.
    1652             :  */
    1653             : static unsigned long lazy_max_pages(void)
    1654             : {
    1655             :         unsigned int log;
    1656             : 
    1657           0 :         log = fls(num_online_cpus());
    1658             : 
    1659           0 :         return log * (32UL * 1024 * 1024 / PAGE_SIZE);
    1660             : }
    1661             : 
    1662             : static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
    1663             : 
    1664             : /*
    1665             :  * Serialize vmap purging.  There is no actual critical section protected
    1666             :  * by this lock, but we want to avoid concurrent calls for performance
    1667             :  * reasons and to make the pcpu_get_vm_areas more deterministic.
    1668             :  */
    1669             : static DEFINE_MUTEX(vmap_purge_lock);
    1670             : 
    1671             : /* for per-CPU blocks */
    1672             : static void purge_fragmented_blocks_allcpus(void);
    1673             : 
    1674             : /*
    1675             :  * Purges all lazily-freed vmap areas.
    1676             :  */
    1677           0 : static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
    1678             : {
    1679             :         unsigned long resched_threshold;
    1680             :         struct list_head local_pure_list;
    1681             :         struct vmap_area *va, *n_va;
    1682             : 
    1683             :         lockdep_assert_held(&vmap_purge_lock);
    1684             : 
    1685           0 :         spin_lock(&purge_vmap_area_lock);
    1686           0 :         purge_vmap_area_root = RB_ROOT;
    1687           0 :         list_replace_init(&purge_vmap_area_list, &local_pure_list);
    1688           0 :         spin_unlock(&purge_vmap_area_lock);
    1689             : 
    1690           0 :         if (unlikely(list_empty(&local_pure_list)))
    1691             :                 return false;
    1692             : 
    1693           0 :         start = min(start,
    1694             :                 list_first_entry(&local_pure_list,
    1695             :                         struct vmap_area, list)->va_start);
    1696             : 
    1697           0 :         end = max(end,
    1698             :                 list_last_entry(&local_pure_list,
    1699             :                         struct vmap_area, list)->va_end);
    1700             : 
    1701           0 :         flush_tlb_kernel_range(start, end);
    1702           0 :         resched_threshold = lazy_max_pages() << 1;
    1703             : 
    1704           0 :         spin_lock(&free_vmap_area_lock);
    1705           0 :         list_for_each_entry_safe(va, n_va, &local_pure_list, list) {
    1706           0 :                 unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
    1707           0 :                 unsigned long orig_start = va->va_start;
    1708           0 :                 unsigned long orig_end = va->va_end;
    1709             : 
    1710             :                 /*
    1711             :                  * Finally insert or merge lazily-freed area. It is
    1712             :                  * detached and there is no need to "unlink" it from
    1713             :                  * anything.
    1714             :                  */
    1715           0 :                 va = merge_or_add_vmap_area_augment(va, &free_vmap_area_root,
    1716             :                                 &free_vmap_area_list);
    1717             : 
    1718           0 :                 if (!va)
    1719           0 :                         continue;
    1720             : 
    1721           0 :                 if (is_vmalloc_or_module_addr((void *)orig_start))
    1722             :                         kasan_release_vmalloc(orig_start, orig_end,
    1723             :                                               va->va_start, va->va_end);
    1724             : 
    1725           0 :                 atomic_long_sub(nr, &vmap_lazy_nr);
    1726             : 
    1727           0 :                 if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
    1728           0 :                         cond_resched_lock(&free_vmap_area_lock);
    1729             :         }
    1730           0 :         spin_unlock(&free_vmap_area_lock);
    1731           0 :         return true;
    1732             : }
    1733             : 
    1734             : /*
    1735             :  * Kick off a purge of the outstanding lazy areas.
                        *
                        * vmap_purge_lock is held across the whole operation. Fragmented per-CPU
                        * vmap blocks are purged first, so the areas they release are already on
                        * the purge list when __purge_vmap_area_lazy() drains it. The range
                        * (ULONG_MAX, 0) is passed so that no extra range is contributed here.
    1736             :  */
    1737           0 : static void purge_vmap_area_lazy(void)
    1738             : {
    1739           0 :         mutex_lock(&vmap_purge_lock);
    1740           0 :         purge_fragmented_blocks_allcpus();
    1741           0 :         __purge_vmap_area_lazy(ULONG_MAX, 0);
    1742           0 :         mutex_unlock(&vmap_purge_lock);
    1743           0 : }
    1744             : 
                       /*
                        * Work handler that drains the lazily-freed areas.
                        *
                        * Each pass purges under vmap_purge_lock. The loop repeats for as long as
                        * vmap_lazy_nr, re-read after the lock has been dropped, still exceeds
                        * lazy_max_pages(). @work is not used.
                        */
    1745           0 : static void drain_vmap_area_work(struct work_struct *work)
    1746             : {
    1747             :         unsigned long nr_lazy;
    1748             : 
    1749             :         do {
    1750           0 :                 mutex_lock(&vmap_purge_lock);
    1751           0 :                 __purge_vmap_area_lazy(ULONG_MAX, 0);
    1752           0 :                 mutex_unlock(&vmap_purge_lock);
    1753             : 
    1754             :                 /* Recheck if further work is required. */
    1755           0 :                 nr_lazy = atomic_long_read(&vmap_lazy_nr);
    1756           0 :         } while (nr_lazy > lazy_max_pages());
    1757           0 : }
    1758             : 
    1759             : /*
    1760             :  * Free a vmap area, caller ensuring that the area has been unmapped
    1761             :  * and flush_cache_vunmap has been called for the correct range
    1762             :  * previously.
                        *
                        * The area is only queued here: it is unlinked from vmap_area_root,
                        * accounted in vmap_lazy_nr (in pages) and placed on the purge tree/list.
                        * If the new lazy total exceeds lazy_max_pages(), drain_vmap_work is
                        * scheduled.
    1763             :  */
    1764           0 : static void free_vmap_area_noflush(struct vmap_area *va)
    1765             : {
    1766             :         unsigned long nr_lazy;
    1767             : 
    1768           0 :         spin_lock(&vmap_area_lock);
    1769           0 :         unlink_va(va, &vmap_area_root);
    1770           0 :         spin_unlock(&vmap_area_lock);
    1771             : 
                               /* nr_lazy is the lazy page total including this area. */
    1772           0 :         nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
    1773             :                                 PAGE_SHIFT, &vmap_lazy_nr);
    1774             : 
    1775             :         /*
    1776             :          * Merge or place it to the purge tree/list.
    1777             :          */
    1778           0 :         spin_lock(&purge_vmap_area_lock);
    1779           0 :         merge_or_add_vmap_area(va,
    1780             :                 &purge_vmap_area_root, &purge_vmap_area_list);
    1781           0 :         spin_unlock(&purge_vmap_area_lock);
    1782             : 
    1783             :         /* After this point, we may free va at any time */
    1784           0 :         if (unlikely(nr_lazy > lazy_max_pages()))
    1785             :                 schedule_work(&drain_vmap_work);
    1786           0 : }
    1787             : 
    1788             : /*
    1789             :  * Free and unmap a vmap area
                        *
                        * Calls flush_cache_vunmap() and vunmap_range_noflush() on the area's
                        * range, then queues the area through free_vmap_area_noflush(). The
                        * kernel TLB range is flushed immediately only when
                        * debug_pagealloc_enabled_static() is true.
    1790             :  */
    1791           0 : static void free_unmap_vmap_area(struct vmap_area *va)
    1792             : {
    1793           0 :         flush_cache_vunmap(va->va_start, va->va_end);
    1794           0 :         vunmap_range_noflush(va->va_start, va->va_end);
    1795             :         if (debug_pagealloc_enabled_static())
    1796             :                 flush_tlb_kernel_range(va->va_start, va->va_end);
    1797             : 
    1798           0 :         free_vmap_area_noflush(va);
    1799           0 : }
    1800             : 
                       /*
                        * Look up the vmap area for @addr with __find_vmap_area() while holding
                        * vmap_area_lock, and return its result.
                        *
                        * The lock is dropped before returning, so the returned pointer is not
                        * protected by vmap_area_lock once the caller sees it.
                        */
    1801             : static struct vmap_area *find_vmap_area(unsigned long addr)
    1802             : {
    1803             :         struct vmap_area *va;
    1804             : 
    1805          15 :         spin_lock(&vmap_area_lock);
    1806          15 :         va = __find_vmap_area(addr);
    1807          15 :         spin_unlock(&vmap_area_lock);
    1808             : 
    1809             :         return va;
    1810             : }
    1811             : 
    1812             : /*** Per cpu kva allocator ***/
    1813             : 
    1814             : /*
    1815             :  * vmap space is limited especially on 32 bit architectures. Ensure there is
    1816             :  * room for at least 16 percpu vmap blocks per CPU.
    1817             :  */
    1818             : /*
    1819             :  * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
    1820             :  * to #define VMALLOC_SPACE             (VMALLOC_END-VMALLOC_START). Guess
    1821             :  * instead (we just need a rough idea)
                        *
                        * The guess is 128MB when BITS_PER_LONG is 32 and 128GB otherwise.
    1822             :  */
    1823             : #if BITS_PER_LONG == 32
    1824             : #define VMALLOC_SPACE           (128UL*1024*1024)
    1825             : #else
    1826             : #define VMALLOC_SPACE           (128UL*1024*1024*1024)
    1827             : #endif
    1828             : 
    1829             : #define VMALLOC_PAGES           (VMALLOC_SPACE / PAGE_SIZE)
    1830             : #define VMAP_MAX_ALLOC          BITS_PER_LONG   /* in pages; 256K with 4K pages on 64-bit */
    1831             : #define VMAP_BBMAP_BITS_MAX     1024    /* 4MB with 4K pages */
    1832             : #define VMAP_BBMAP_BITS_MIN     (VMAP_MAX_ALLOC*2)
    1833             : #define VMAP_MIN(x, y)          ((x) < (y) ? (x) : (y)) /* can't use min() */
    1834             : #define VMAP_MAX(x, y)          ((x) > (y) ? (x) : (y)) /* can't use max() */
                       /*
                        * Number of pages in one vmap block:
                        * VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16, clamped to the range
                        * [VMAP_BBMAP_BITS_MIN, VMAP_BBMAP_BITS_MAX].
                        */
    1835             : #define VMAP_BBMAP_BITS         \
    1836             :                 VMAP_MIN(VMAP_BBMAP_BITS_MAX,   \
    1837             :                 VMAP_MAX(VMAP_BBMAP_BITS_MIN,   \
    1838             :                         VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))
    1839             : 
                       /* Size of one vmap block in bytes. */
    1840             : #define VMAP_BLOCK_SIZE         (VMAP_BBMAP_BITS * PAGE_SIZE)
    1841             : 
                       /*
                        * Queue of vmap blocks; one instance exists per CPU (see the
                        * DEFINE_PER_CPU below).
                        *
                        * @lock: taken around list_add_tail_rcu()/list_del_rcu() on @free
                        * @free: list of struct vmap_block, linked through vmap_block.free_list
                        *        and walked by readers with list_for_each_entry_rcu()
                        */
    1842             : struct vmap_block_queue {
    1843             :         spinlock_t lock;
    1844             :         struct list_head free;
    1845             : };
    1846             : 
                       /*
                        * One block of VMAP_BBMAP_BITS pages that small allocations are carved
                        * from.
                        *
                        * @lock:      protects the counters below
                        * @va:        the vmap area backing this block
                        * @free:      number of pages not handed out yet; allocations are taken
                        *             at page offset VMAP_BBMAP_BITS - free
                        * @dirty:     number of pages that were handed out and freed again
                        * @dirty_min: lowest page offset covered by a freed range
                        * @dirty_max: end (exclusive) page offset of the freed ranges
                        * @free_list: link on vmap_block_queue.free
                        * @rcu_head:  used by kfree_rcu() when the block is released
                        * @purge:     link on the local list built by purge_fragmented_blocks()
                        */
    1847             : struct vmap_block {
    1848             :         spinlock_t lock;
    1849             :         struct vmap_area *va;
    1850             :         unsigned long free, dirty;
    1851             :         unsigned long dirty_min, dirty_max; /* dirty range (page offsets) */
    1852             :         struct list_head free_list;
    1853             :         struct rcu_head rcu_head;
    1854             :         struct list_head purge;
    1855             : };
    1856             : 
    1857             : /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
    1858             : static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
    1859             : 
    1860             : /*
    1861             :  * XArray of vmap blocks, indexed by address, to quickly find a vmap block
    1862             :  * in the free path. Could get rid of this if we change the API to return a
    1863             :  * "cookie" from alloc, to be passed to free. But no big deal yet.
                        *
                        * The index is the value returned by addr_to_vb_idx() for the block's
                        * start address.
    1864             :  */
    1865             : static DEFINE_XARRAY(vmap_blocks);
    1866             : 
    1867             : /*
    1868             :  * We should probably have a fallback mechanism to allocate virtual memory
    1869             :  * out of partially filled vmap blocks. However vmap block sizing should be
    1870             :  * fairly reasonable according to the vmalloc size, so it shouldn't be a
    1871             :  * big problem.
    1872             :  */
    1873             : 
                       /*
                        * Convert an address into the index used for the vmap_blocks XArray.
                        *
                        * The address is rebased against VMALLOC_START rounded down to a
                        * VMAP_BLOCK_SIZE boundary and then divided by VMAP_BLOCK_SIZE, so every
                        * address inside one VMAP_BLOCK_SIZE-aligned block yields the same index.
                        */
    1874             : static unsigned long addr_to_vb_idx(unsigned long addr)
    1875             : {
    1876           0 :         addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
    1877           0 :         addr /= VMAP_BLOCK_SIZE;
    1878             :         return addr;
    1879             : }
    1880             : 
                       /*
                        * Return the address of page @pages_off inside the block that starts at
                        * @va_start. Triggers BUG_ON() if the resulting address maps to a
                        * different block index than @va_start.
                        */
    1881           0 : static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
    1882             : {
    1883             :         unsigned long addr;
    1884             : 
    1885           0 :         addr = va_start + (pages_off << PAGE_SHIFT);
    1886           0 :         BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
    1887           0 :         return (void *)addr;
    1888             : }
    1889             : 
    1890             : /**
    1891             :  * new_vmap_block - allocates new vmap_block and occupies 2^order pages in this
    1892             :  *                  block. Of course pages number can't exceed VMAP_BBMAP_BITS
    1893             :  * @order:    how many 2^order pages should be occupied in newly allocated block
    1894             :  * @gfp_mask: flags for the page level allocator
    1895             :  *
                        * The occupied pages are the first ones of the block. The new block is
                        * inserted into the vmap_blocks XArray and appended to the free list of
                        * the current CPU's vmap_block_queue.
                        *
    1896             :  * Return: virtual address in a newly allocated block or ERR_PTR(-errno)
    1897             :  */
    1898           0 : static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
    1899             : {
    1900             :         struct vmap_block_queue *vbq;
    1901             :         struct vmap_block *vb;
    1902             :         struct vmap_area *va;
    1903             :         unsigned long vb_idx;
    1904             :         int node, err;
    1905             :         void *vaddr;
    1906             : 
    1907           0 :         node = numa_node_id();
    1908             : 
    1909           0 :         vb = kmalloc_node(sizeof(struct vmap_block),
    1910             :                         gfp_mask & GFP_RECLAIM_MASK, node);
    1911           0 :         if (unlikely(!vb))
    1912             :                 return ERR_PTR(-ENOMEM);
    1913             : 
                               /* Size and alignment are both VMAP_BLOCK_SIZE. */
    1914           0 :         va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
    1915           0 :                                         VMALLOC_START, VMALLOC_END,
    1916             :                                         node, gfp_mask);
    1917           0 :         if (IS_ERR(va)) {
    1918           0 :                 kfree(vb);
    1919           0 :                 return ERR_CAST(va);
    1920             :         }
    1921             : 
                               /* The caller's pages start at page offset 0 of the block. */
    1922           0 :         vaddr = vmap_block_vaddr(va->va_start, 0);
    1923           0 :         spin_lock_init(&vb->lock);
    1924           0 :         vb->va = va;
    1925             :         /* At least something should be left free */
    1926           0 :         BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
    1927           0 :         vb->free = VMAP_BBMAP_BITS - (1UL << order);
    1928           0 :         vb->dirty = 0;
                               /* Empty dirty range (min > max); vb_free() widens it with min()/max(). */
    1929           0 :         vb->dirty_min = VMAP_BBMAP_BITS;
    1930           0 :         vb->dirty_max = 0;
    1931           0 :         INIT_LIST_HEAD(&vb->free_list);
    1932             : 
    1933           0 :         vb_idx = addr_to_vb_idx(va->va_start);
    1934           0 :         err = xa_insert(&vmap_blocks, vb_idx, vb, gfp_mask);
    1935           0 :         if (err) {
    1936           0 :                 kfree(vb);
    1937           0 :                 free_vmap_area(va);
    1938           0 :                 return ERR_PTR(err);
    1939             :         }
    1940             : 
    1941           0 :         vbq = &get_cpu_var(vmap_block_queue);
    1942           0 :         spin_lock(&vbq->lock);
    1943           0 :         list_add_tail_rcu(&vb->free_list, &vbq->free);
    1944           0 :         spin_unlock(&vbq->lock);
    1945           0 :         put_cpu_var(vmap_block_queue);
    1946             : 
    1947           0 :         return vaddr;
    1948             : }
    1949             : 
                       /*
                        * Release a vmap block: erase it from the vmap_blocks XArray (BUG_ON() if
                        * the erased entry is not @vb), queue its vmap area through
                        * free_vmap_area_noflush() and free @vb itself with kfree_rcu().
                        */
    1950           0 : static void free_vmap_block(struct vmap_block *vb)
    1951             : {
    1952             :         struct vmap_block *tmp;
    1953             : 
    1954           0 :         tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start));
    1955           0 :         BUG_ON(tmp != vb);
    1956             : 
    1957           0 :         free_vmap_area_noflush(vb->va);
    1958           0 :         kfree_rcu(vb, rcu_head);
    1959           0 : }
    1960             : 
                       /*
                        * Release the fragmented vmap blocks queued on @cpu.
                        *
                        * A block qualifies when free + dirty == VMAP_BBMAP_BITS while dirty is
                        * not VMAP_BBMAP_BITS, i.e. every page that was handed out has been freed
                        * again but part of the block was never used. Such blocks are marked as
                        * fully dirty, unlinked from the free list and collected on a local list,
                        * then released with free_vmap_block() after the RCU walk has ended.
                        */
    1961           0 : static void purge_fragmented_blocks(int cpu)
    1962             : {
    1963           0 :         LIST_HEAD(purge);
    1964             :         struct vmap_block *vb;
    1965             :         struct vmap_block *n_vb;
    1966           0 :         struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
    1967             : 
    1968             :         rcu_read_lock();
    1969           0 :         list_for_each_entry_rcu(vb, &vbq->free, free_list) {
    1970             : 
                                       /* Unlocked pre-check; repeated below under vb->lock. */
    1971           0 :                 if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
    1972           0 :                         continue;
    1973             : 
    1974           0 :                 spin_lock(&vb->lock);
    1975           0 :                 if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
    1976           0 :                         vb->free = 0; /* prevent further allocs after releasing lock */
    1977           0 :                         vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
    1978           0 :                         vb->dirty_min = 0;
    1979           0 :                         vb->dirty_max = VMAP_BBMAP_BITS;
    1980           0 :                         spin_lock(&vbq->lock);
    1981           0 :                         list_del_rcu(&vb->free_list);
    1982           0 :                         spin_unlock(&vbq->lock);
    1983           0 :                         spin_unlock(&vb->lock);
    1984           0 :                         list_add_tail(&vb->purge, &purge);
    1985             :                 } else
    1986           0 :                         spin_unlock(&vb->lock);
    1987             :         }
    1988             :         rcu_read_unlock();
    1989             : 
    1990           0 :         list_for_each_entry_safe(vb, n_vb, &purge, purge) {
    1991           0 :                 list_del(&vb->purge);
    1992           0 :                 free_vmap_block(vb);
    1993             :         }
    1994           0 : }
    1995             : 
                       /* Run purge_fragmented_blocks() for every possible CPU. */
    1996             : static void purge_fragmented_blocks_allcpus(void)
    1997             : {
    1998             :         int cpu;
    1999             : 
    2000           0 :         for_each_possible_cpu(cpu)
    2001           0 :                 purge_fragmented_blocks(cpu);
    2002             : }
    2003             : 
                       /*
                        * Allocate @size bytes of virtual address space from a per-CPU vmap block.
                        *
                        * @size must be a multiple of the page size and at most
                        * PAGE_SIZE * VMAP_MAX_ALLOC (both enforced with BUG_ON()). The request is
                        * rounded up to 2^order pages and served from the first block on the
                        * current CPU's free list that has enough pages left; if none does, a new
                        * block is created with new_vmap_block() using @gfp_mask.
                        *
                        * Pages are taken at offset VMAP_BBMAP_BITS - vb->free, so pages that
                        * have been freed again (dirty) are not handed out a second time.
                        *
                        * Return: the allocated address, NULL (after a warning) if @size is 0, or
                        * the ERR_PTR() value returned by new_vmap_block().
                        *
                        * NOTE(review): callers must cope with both NULL and ERR_PTR() results.
                        */
    2004           0 : static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
    2005             : {
    2006             :         struct vmap_block_queue *vbq;
    2007             :         struct vmap_block *vb;
    2008           0 :         void *vaddr = NULL;
    2009             :         unsigned int order;
    2010             : 
    2011           0 :         BUG_ON(offset_in_page(size));
    2012           0 :         BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
    2013           0 :         if (WARN_ON(size == 0)) {
    2014             :                 /*
    2015             :                  * Allocating 0 bytes isn't what caller wants since
    2016             :                  * get_order(0) returns funny result. Just warn and terminate
    2017             :                  * early.
    2018             :                  */
    2019             :                 return NULL;
    2020             :         }
    2021           0 :         order = get_order(size);
    2022             : 
    2023             :         rcu_read_lock();
    2024           0 :         vbq = &get_cpu_var(vmap_block_queue);
    2025           0 :         list_for_each_entry_rcu(vb, &vbq->free, free_list) {
    2026             :                 unsigned long pages_off;
    2027             : 
    2028           0 :                 spin_lock(&vb->lock);
    2029           0 :                 if (vb->free < (1UL << order)) {
    2030           0 :                         spin_unlock(&vb->lock);
    2031           0 :                         continue;
    2032             :                 }
    2033             : 
    2034           0 :                 pages_off = VMAP_BBMAP_BITS - vb->free;
    2035           0 :                 vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
    2036           0 :                 vb->free -= 1UL << order;
                                       /* Nothing left to hand out: take the block off the free list. */
    2037           0 :                 if (vb->free == 0) {
    2038           0 :                         spin_lock(&vbq->lock);
    2039           0 :                         list_del_rcu(&vb->free_list);
    2040           0 :                         spin_unlock(&vbq->lock);
    2041             :                 }
    2042             : 
    2043           0 :                 spin_unlock(&vb->lock);
    2044             :                 break;
    2045             :         }
    2046             : 
    2047           0 :         put_cpu_var(vmap_block_queue);
    2048             :         rcu_read_unlock();
    2049             : 
    2050             :         /* Allocate new block if nothing was found */
    2051           0 :         if (!vaddr)
    2052           0 :                 vaddr = new_vmap_block(order, gfp_mask);
    2053             : 
    2054             :         return vaddr;
    2055             : }
    2056             : 
                       /*
                        * Give @size bytes at @addr back to the vmap block they were taken from.
                        *
                        * @size must be a multiple of the page size and at most
                        * PAGE_SIZE * VMAP_MAX_ALLOC (both enforced with BUG_ON()). The range is
                        * unmapped with vunmap_range_noflush(); the kernel TLB range is flushed
                        * right away only when debug_pagealloc_enabled_static() is true. The
                        * block's dirty range and dirty count are then updated, and once all
                        * VMAP_BBMAP_BITS pages are dirty the block is released.
                        *
                        * NOTE(review): the xa_load() result is dereferenced without a NULL
                        * check, so @addr must belong to a block that is still in vmap_blocks.
                        */
    2057           0 : static void vb_free(unsigned long addr, unsigned long size)
    2058             : {
    2059             :         unsigned long offset;
    2060             :         unsigned int order;
    2061             :         struct vmap_block *vb;
    2062             : 
    2063           0 :         BUG_ON(offset_in_page(size));
    2064           0 :         BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
    2065             : 
    2066           0 :         flush_cache_vunmap(addr, addr + size);
    2067             : 
    2068           0 :         order = get_order(size);
                               /* Page offset of @addr inside its block. */
    2069           0 :         offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
    2070           0 :         vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr));
    2071             : 
    2072           0 :         vunmap_range_noflush(addr, addr + size);
    2073             : 
    2074             :         if (debug_pagealloc_enabled_static())
    2075             :                 flush_tlb_kernel_range(addr, addr + size);
    2076             : 
    2077           0 :         spin_lock(&vb->lock);
    2078             : 
    2079             :         /* Expand dirty range */
    2080           0 :         vb->dirty_min = min(vb->dirty_min, offset);
    2081           0 :         vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));
    2082             : 
    2083           0 :         vb->dirty += 1UL << order;
    2084           0 :         if (vb->dirty == VMAP_BBMAP_BITS) {
                                       /* Every page is dirty, so none may still be free. */
    2085           0 :                 BUG_ON(vb->free);
    2086           0 :                 spin_unlock(&vb->lock);
    2087           0 :                 free_vmap_block(vb);
    2088             :         } else
    2089           0 :                 spin_unlock(&vb->lock);
    2090           0 : }
    2091             : 
                       /*
                        * Flush lazily kept aliases, covering at least [@start, @end).
                        *
                        * The range is widened by the dirty range of every partly dirty block
                        * found on the per-CPU free lists, and @flush is set when such a block is
                        * seen. Fragmented blocks and lazily-freed areas are then purged under
                        * vmap_purge_lock. If __purge_vmap_area_lazy() returns false and @flush
                        * is set, the TLB range is flushed here instead.
                        *
                        * Returns early, doing nothing, while vmap_initialized is false.
                        *
                        * NOTE(review): only blocks still linked on vbq->free are scanned.
                        * vb_alloc() unlinks a block once vb->free reaches 0, so the dirty range
                        * of a block that is fully handed out but only partly freed is not
                        * picked up by this loop - confirm it is flushed elsewhere.
                        */
    2092           0 : static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
    2093             : {
    2094             :         int cpu;
    2095             : 
    2096           0 :         if (unlikely(!vmap_initialized))
    2097             :                 return;
    2098             : 
    2099             :         might_sleep();
    2100             : 
    2101           0 :         for_each_possible_cpu(cpu) {
    2102           0 :                 struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
    2103             :                 struct vmap_block *vb;
    2104             : 
    2105             :                 rcu_read_lock();
    2106           0 :                 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
    2107           0 :                         spin_lock(&vb->lock);
                                               /* Skip clean blocks and fully dirty ones. */
    2108           0 :                         if (vb->dirty && vb->dirty != VMAP_BBMAP_BITS) {
    2109           0 :                                 unsigned long va_start = vb->va->va_start;
    2110             :                                 unsigned long s, e;
    2111             : 
    2112           0 :                                 s = va_start + (vb->dirty_min << PAGE_SHIFT);
    2113           0 :                                 e = va_start + (vb->dirty_max << PAGE_SHIFT);
    2114             : 
    2115           0 :                                 start = min(s, start);
    2116           0 :                                 end   = max(e, end);
    2117             : 
    2118           0 :                                 flush = 1;
    2119             :                         }
    2120           0 :                         spin_unlock(&vb->lock);
    2121             :                 }
    2122             :                 rcu_read_unlock();
    2123             :         }
    2124             : 
    2125           0 :         mutex_lock(&vmap_purge_lock);
    2126           0 :         purge_fragmented_blocks_allcpus();
    2127           0 :         if (!__purge_vmap_area_lazy(start, end) && flush)
    2128           0 :                 flush_tlb_kernel_range(start, end);
    2129           0 :         mutex_unlock(&vmap_purge_lock);
    2130             : }
    2131             : 
    2132             : /**
    2133             :  * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
    2134             :  *
    2135             :  * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
    2136             :  * to amortize TLB flushing overheads. What this means is that any page you
    2137             :  * have now, may, in a former life, have been mapped into kernel virtual
    2138             :  * address by the vmap layer and so there might be some CPUs with TLB entries
    2139             :  * still referencing that page (additional to the regular 1:1 kernel mapping).
    2140             :  *
    2141             :  * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
    2142             :  * be sure that none of the pages we have control over will have any aliases
    2143             :  * from the vmap layer.
    2144             :  */
    2145           0 : void vm_unmap_aliases(void)
    2146             : {
                               /* Empty range (start > end) and no forced flush to begin with. */
    2147           0 :         unsigned long start = ULONG_MAX, end = 0;
    2148           0 :         int flush = 0;
    2149             : 
    2150           0 :         _vm_unmap_aliases(start, end, flush);
    2151           0 : }
    2152             : EXPORT_SYMBOL_GPL(vm_unmap_aliases);
    2153             : 
    2154             : /**
    2155             :  * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
    2156             :  * @mem: the pointer returned by vm_map_ram
    2157             :  * @count: the count passed to that vm_map_ram call (cannot unmap partial)
                        *
                        * @count selects the release path with the same test vm_map_ram() uses:
                        * up to VMAP_MAX_ALLOC pages are returned to a per-CPU vmap block with
                        * vb_free(), larger mappings are looked up with find_vmap_area() and
                        * released with free_unmap_vmap_area().
    2158             :  */
    2159           0 : void vm_unmap_ram(const void *mem, unsigned int count)
    2160             : {
    2161           0 :         unsigned long size = (unsigned long)count << PAGE_SHIFT;
    2162           0 :         unsigned long addr = (unsigned long)kasan_reset_tag(mem);
    2163             :         struct vmap_area *va;
    2164             : 
    2165             :         might_sleep();
                               /* @mem must be a page-aligned address inside the vmalloc range. */
    2166           0 :         BUG_ON(!addr);
    2167           0 :         BUG_ON(addr < VMALLOC_START);
    2168           0 :         BUG_ON(addr > VMALLOC_END);
    2169           0 :         BUG_ON(!PAGE_ALIGNED(addr));
    2170             : 
    2171           0 :         kasan_poison_vmalloc(mem, size);
    2172             : 
    2173           0 :         if (likely(count <= VMAP_MAX_ALLOC)) {
    2174           0 :                 debug_check_no_locks_freed(mem, size);
    2175           0 :                 vb_free(addr, size);
    2176           0 :                 return;
    2177             :         }
    2178             : 
    2179           0 :         va = find_vmap_area(addr);
    2180           0 :         BUG_ON(!va);
    2181           0 :         debug_check_no_locks_freed((void *)va->va_start,
    2182           0 :                                     (va->va_end - va->va_start));
    2183           0 :         free_unmap_vmap_area(va);
    2184             : }
    2185             : EXPORT_SYMBOL(vm_unmap_ram);
    2186             : 
    2187             : /**
    2188             :  * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
    2189             :  * @pages: an array of pointers to the pages to be mapped
    2190             :  * @count: number of pages
    2191             :  * @node: prefer to allocate data structures on this node
                        *        (only passed on when @count exceeds VMAP_MAX_ALLOC)
    2192             :  *
    2193             :  * If you use this function for no more than VMAP_MAX_ALLOC pages, it could
    2194             :  * be faster than vmap.  But if you mix long-life and short-life
    2195             :  * objects with vm_map_ram(), it could consume lots of address space through
    2196             :  * fragmentation (especially on a 32bit machine).  You could see failures in
    2197             :  * the end.  Please use this function for short-lived objects.
    2198             :  *
    2199             :  * Returns: a pointer to the address that has been mapped, or %NULL on failure
    2200             :  */
    2201           0 : void *vm_map_ram(struct page **pages, unsigned int count, int node)
    2202             : {
    2203           0 :         unsigned long size = (unsigned long)count << PAGE_SHIFT;
    2204             :         unsigned long addr;
    2205             :         void *mem;
    2206             : 
    2207           0 :         if (likely(count <= VMAP_MAX_ALLOC)) {
                                       /*
                                        * NOTE(review): vb_alloc() returns NULL, not an ERR_PTR(),
                                        * for size == 0, which IS_ERR() does not catch - confirm
                                        * that @count == 0 is never passed.
                                        */
    2208           0 :                 mem = vb_alloc(size, GFP_KERNEL);
    2209           0 :                 if (IS_ERR(mem))
    2210             :                         return NULL;
    2211             :                 addr = (unsigned long)mem;
    2212             :         } else {
    2213             :                 struct vmap_area *va;
    2214           0 :                 va = alloc_vmap_area(size, PAGE_SIZE,
    2215           0 :                                 VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
    2216           0 :                 if (IS_ERR(va))
    2217             :                         return NULL;
    2218             : 
    2219           0 :                 addr = va->va_start;
    2220           0 :                 mem = (void *)addr;
    2221             :         }
    2222             : 
                               /* On mapping failure, release the address range again. */
    2223           0 :         if (vmap_pages_range(addr, addr + size, PAGE_KERNEL,
    2224             :                                 pages, PAGE_SHIFT) < 0) {
    2225           0 :                 vm_unmap_ram(mem, count);
    2226           0 :                 return NULL;
    2227             :         }
    2228             : 
    2229             :         /*
    2230             :          * Mark the pages as accessible, now that they are mapped.
    2231             :          * With hardware tag-based KASAN, marking is skipped for
    2232             :          * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
    2233             :          */
    2234             :         mem = kasan_unpoison_vmalloc(mem, size, KASAN_VMALLOC_PROT_NORMAL);
    2235             : 
    2236             :         return mem;
    2237             : }
    2238             : EXPORT_SYMBOL(vm_map_ram);
    2239             : 
                       /*
                        * Singly linked list (through vm_struct.next) of the areas added by
                        * vm_area_add_early() and vm_area_register_early().
                        */
    2240             : static struct vm_struct *vmlist __initdata;
    2241             : 
                       /*
                        * Return the page order recorded in @vm. Without
                        * CONFIG_HAVE_ARCH_HUGE_VMALLOC the order is always 0.
                        */
    2242             : static inline unsigned int vm_area_page_order(struct vm_struct *vm)
    2243             : {
    2244             : #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
    2245             :         return vm->page_order;
    2246             : #else
    2247             :         return 0;
    2248             : #endif
    2249             : }
    2250             : 
                       /*
                        * Record @order as the page order of @vm. Without
                        * CONFIG_HAVE_ARCH_HUGE_VMALLOC nothing is stored and any order other
                        * than 0 triggers BUG_ON().
                        */
    2251          15 : static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order)
    2252             : {
    2253             : #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
    2254             :         vm->page_order = order;
    2255             : #else
    2256          15 :         BUG_ON(order != 0);
    2257             : #endif
    2258          15 : }
    2259             : 
    2260             : /**
    2261             :  * vm_area_add_early - add vmap area early during boot
    2262             :  * @vm: vm_struct to add
    2263             :  *
    2264             :  * This function is used to add fixed kernel vm area to vmlist before
    2265             :  * vmalloc_init() is called.  @vm->addr, @vm->size, and @vm->flags
    2266             :  * should contain proper values and the other fields should be zero.
    2267             :  *
                        * @vm is inserted in front of the first entry whose address is not below
                        * @vm->addr. An overlap with an entry visited during the walk triggers
                        * BUG_ON().
                        *
    2268             :  * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
    2269             :  */
    2270           0 : void __init vm_area_add_early(struct vm_struct *vm)
    2271             : {
    2272             :         struct vm_struct *tmp, **p;
    2273             : 
    2274           0 :         BUG_ON(vmap_initialized);
    2275           0 :         for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
    2276           0 :                 if (tmp->addr >= vm->addr) {
                                               /* The following entry must start at or after the end of @vm. */
    2277           0 :                         BUG_ON(tmp->addr < vm->addr + vm->size);
    2278             :                         break;
    2279             :                 } else
                                               /* A preceding entry must end at or before the start of @vm. */
    2280           0 :                         BUG_ON(tmp->addr + tmp->size > vm->addr);
    2281             :         }
    2282           0 :         vm->next = *p;
    2283           0 :         *p = vm;
    2284           0 : }
    2285             : 
    2286             : /**
    2287             :  * vm_area_register_early - register vmap area early during boot
    2288             :  * @vm: vm_struct to register
    2289             :  * @align: requested alignment
    2290             :  *
    2291             :  * This function is used to register kernel vm area before
    2292             :  * vmalloc_init() is called.  @vm->size and @vm->flags should contain
    2293             :  * proper values on entry and other fields should be zero.  On return,
    2294             :  * vm->addr contains the allocated address.
    2295             :  *
                        * The address is searched first-fit: starting at VMALLOC_START aligned to
                        * @align, vmlist is walked until the gap in front of an entry is at least
                        * @vm->size; otherwise the candidate moves to the aligned end of that
                        * entry. BUG_ON() fires if the result would extend past VMALLOC_END.
                        *
    2296             :  * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
    2297             :  */
    2298           0 : void __init vm_area_register_early(struct vm_struct *vm, size_t align)
    2299             : {
    2300           0 :         unsigned long addr = ALIGN(VMALLOC_START, align);
    2301             :         struct vm_struct *cur, **p;
    2302             : 
    2303           0 :         BUG_ON(vmap_initialized);
    2304             : 
    2305           0 :         for (p = &vmlist; (cur = *p) != NULL; p = &cur->next) {
                                       /*
                                        * NOTE(review): unsigned subtraction; this assumes
                                        * cur->addr >= addr - TODO confirm.
                                        */
    2306           0 :                 if ((unsigned long)cur->addr - addr >= vm->size)
    2307             :                         break;
    2308           0 :                 addr = ALIGN((unsigned long)cur->addr + cur->size, align);
    2309             :         }
    2310             : 
    2311           0 :         BUG_ON(addr > VMALLOC_END - vm->size);
    2312           0 :         vm->addr = (void *)addr;
    2313           0 :         vm->next = *p;
    2314           0 :         *p = vm;
    2315           0 :         kasan_populate_early_vm_area_shadow(vm->addr, vm->size);
    2316           0 : }
    2317             : 
    2318           1 : static void vmap_init_free_space(void)
    2319             : {
    2320           1 :         unsigned long vmap_start = 1;
    2321           1 :         const unsigned long vmap_end = ULONG_MAX;
    2322             :         struct vmap_area *busy, *free;
    2323             : 
    2324             :         /*
    2325             :          *     B     F     B     B     B     F
    2326             :          * -|-----|.....|-----|-----|-----|.....|-
    2327             :          *  |           The KVA space           |
    2328             :          *  |<--------------------------------->|
    2329             :          */
    2330           1 :         list_for_each_entry(busy, &vmap_area_list, list) {
    2331           0 :                 if (busy->va_start - vmap_start > 0) {
    2332           0 :                         free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
    2333           0 :                         if (!WARN_ON_ONCE(!free)) {
    2334           0 :                                 free->va_start = vmap_start;
    2335           0 :                                 free->va_end = busy->va_start;
    2336             : 
    2337           0 :                                 insert_vmap_area_augment(free, NULL,
    2338             :                                         &free_vmap_area_root,
    2339             :                                                 &free_vmap_area_list);
    2340             :                         }
    2341             :                 }
    2342             : 
    2343           0 :                 vmap_start = busy->va_end;
    2344             :         }
    2345             : 
    2346           1 :         if (vmap_end - vmap_start > 0) {
    2347           2 :                 free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
    2348           1 :                 if (!WARN_ON_ONCE(!free)) {
    2349           1 :                         free->va_start = vmap_start;
    2350           1 :                         free->va_end = vmap_end;
    2351             : 
    2352           1 :                         insert_vmap_area_augment(free, NULL,
    2353             :                                 &free_vmap_area_root,
    2354             :                                         &free_vmap_area_list);
    2355             :                 }
    2356             :         }
    2357           1 : }
    2358             : 
    2359           1 : void __init vmalloc_init(void)
    2360             : {
    2361             :         struct vmap_area *va;
    2362             :         struct vm_struct *tmp;
    2363             :         int i;
    2364             : 
    2365             :         /*
    2366             :          * Create the cache for vmap_area objects.
    2367             :          */
    2368           1 :         vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
    2369             : 
    2370           2 :         for_each_possible_cpu(i) {
    2371             :                 struct vmap_block_queue *vbq;
    2372             :                 struct vfree_deferred *p;
    2373             : 
    2374           1 :                 vbq = &per_cpu(vmap_block_queue, i);
    2375           1 :                 spin_lock_init(&vbq->lock);
    2376           2 :                 INIT_LIST_HEAD(&vbq->free);
    2377           1 :                 p = &per_cpu(vfree_deferred, i);
    2378           2 :                 init_llist_head(&p->list);
    2379           2 :                 INIT_WORK(&p->wq, free_work);
    2380             :         }
    2381             : 
    2382             :         /* Import existing vmlist entries. */
    2383           1 :         for (tmp = vmlist; tmp; tmp = tmp->next) {
    2384           0 :                 va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
    2385           0 :                 if (WARN_ON_ONCE(!va))
    2386           0 :                         continue;
    2387             : 
    2388           0 :                 va->va_start = (unsigned long)tmp->addr;
    2389           0 :                 va->va_end = va->va_start + tmp->size;
    2390           0 :                 va->vm = tmp;
    2391           0 :                 insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
    2392             :         }
    2393             : 
    2394             :         /*
    2395             :          * Now we can initialize a free vmap space.
    2396             :          */
    2397           1 :         vmap_init_free_space();
    2398           1 :         vmap_initialized = true;
    2399           1 : }
    2400             : 
    2401             : static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
    2402             :         struct vmap_area *va, unsigned long flags, const void *caller)
    2403             : {
    2404          15 :         vm->flags = flags;
    2405          15 :         vm->addr = (void *)va->va_start;
    2406          15 :         vm->size = va->va_end - va->va_start;
    2407          15 :         vm->caller = caller;
    2408          15 :         va->vm = vm;
    2409             : }
    2410             : 
    2411             : static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
    2412             :                               unsigned long flags, const void *caller)
    2413             : {
    2414          15 :         spin_lock(&vmap_area_lock);
    2415          15 :         setup_vmalloc_vm_locked(vm, va, flags, caller);
    2416          15 :         spin_unlock(&vmap_area_lock);
    2417             : }
    2418             : 
    2419             : static void clear_vm_uninitialized_flag(struct vm_struct *vm)
    2420             : {
    2421             :         /*
    2422             :          * Before removing VM_UNINITIALIZED,
    2423             :          * we should make sure that vm has proper values.
    2424             :          * Pair with smp_rmb() in show_numa_info().
    2425             :          */
    2426          15 :         smp_wmb();
    2427          15 :         vm->flags &= ~VM_UNINITIALIZED;
    2428             : }
    2429             : 
    2430          15 : static struct vm_struct *__get_vm_area_node(unsigned long size,
    2431             :                 unsigned long align, unsigned long shift, unsigned long flags,
    2432             :                 unsigned long start, unsigned long end, int node,
    2433             :                 gfp_t gfp_mask, const void *caller)
    2434             : {
    2435             :         struct vmap_area *va;
    2436             :         struct vm_struct *area;
    2437          15 :         unsigned long requested_size = size;
    2438             : 
    2439          15 :         BUG_ON(in_interrupt());
    2440          15 :         size = ALIGN(size, 1ul << shift);
    2441          15 :         if (unlikely(!size))
    2442             :                 return NULL;
    2443             : 
    2444          15 :         if (flags & VM_IOREMAP)
    2445           0 :                 align = 1ul << clamp_t(int, get_count_order_long(size),
    2446             :                                        PAGE_SHIFT, IOREMAP_MAX_ORDER);
    2447             : 
    2448          15 :         area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
    2449          15 :         if (unlikely(!area))
    2450             :                 return NULL;
    2451             : 
    2452          15 :         if (!(flags & VM_NO_GUARD))
    2453          15 :                 size += PAGE_SIZE;
    2454             : 
    2455          15 :         va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
    2456          15 :         if (IS_ERR(va)) {
    2457           0 :                 kfree(area);
    2458             :                 return NULL;
    2459             :         }
    2460             : 
    2461          15 :         setup_vmalloc_vm(area, va, flags, caller);
    2462             : 
    2463             :         /*
    2464             :          * Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
    2465             :          * best-effort approach, as they can be mapped outside of vmalloc code.
    2466             :          * For VM_ALLOC mappings, the pages are marked as accessible after
    2467             :          * getting mapped in __vmalloc_node_range().
    2468             :          * With hardware tag-based KASAN, marking is skipped for
    2469             :          * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
    2470             :          */
    2471          15 :         if (!(flags & VM_ALLOC))
    2472             :                 area->addr = kasan_unpoison_vmalloc(area->addr, requested_size,
    2473             :                                                     KASAN_VMALLOC_PROT_NORMAL);
    2474             : 
    2475             :         return area;
    2476             : }
    2477             : 
    2478           0 : struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
    2479             :                                        unsigned long start, unsigned long end,
    2480             :                                        const void *caller)
    2481             : {
    2482           0 :         return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, start, end,
    2483             :                                   NUMA_NO_NODE, GFP_KERNEL, caller);
    2484             : }
    2485             : 
    2486             : /**
    2487             :  * get_vm_area - reserve a contiguous kernel virtual area
    2488             :  * @size:        size of the area
    2489             :  * @flags:       %VM_IOREMAP for I/O mappings or VM_ALLOC
    2490             :  *
    2491             :  * Search an area of @size in the kernel virtual mapping area,
    2492             :  * and reserve it for our purposes.  Returns the area descriptor
    2493             :  * on success or %NULL on failure.
    2494             :  *
    2495             :  * Return: the area descriptor on success or %NULL on failure.
    2496             :  */
    2497           0 : struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
    2498             : {
    2499           0 :         return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
    2500           0 :                                   VMALLOC_START, VMALLOC_END,
    2501             :                                   NUMA_NO_NODE, GFP_KERNEL,
    2502           0 :                                   __builtin_return_address(0));
    2503             : }
    2504             : 
    2505           0 : struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
    2506             :                                 const void *caller)
    2507             : {
    2508           0 :         return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
    2509           0 :                                   VMALLOC_START, VMALLOC_END,
    2510             :                                   NUMA_NO_NODE, GFP_KERNEL, caller);
    2511             : }
    2512             : 
    2513             : /**
    2514             :  * find_vm_area - find a continuous kernel virtual area
    2515             :  * @addr:         base address
    2516             :  *
    2517             :  * Search for the kernel VM area starting at @addr, and return it.
    2518             :  * It is up to the caller to do all required locking to keep the returned
    2519             :  * pointer valid.
    2520             :  *
    2521             :  * Return: the area descriptor on success or %NULL on failure.
    2522             :  */
    2523          15 : struct vm_struct *find_vm_area(const void *addr)
    2524             : {
    2525             :         struct vmap_area *va;
    2526             : 
    2527          30 :         va = find_vmap_area((unsigned long)addr);
    2528          15 :         if (!va)
    2529             :                 return NULL;
    2530             : 
    2531          15 :         return va->vm;
    2532             : }
    2533             : 
    2534             : /**
    2535             :  * remove_vm_area - find and remove a continuous kernel virtual area
    2536             :  * @addr:           base address
    2537             :  *
    2538             :  * Search for the kernel VM area starting at @addr, and remove it.
    2539             :  * This function returns the found VM area, but using it is NOT safe
    2540             :  * on SMP machines, except for its size or flags.
    2541             :  *
    2542             :  * Return: the area descriptor on success or %NULL on failure.
    2543             :  */
    2544           0 : struct vm_struct *remove_vm_area(const void *addr)
    2545             : {
    2546             :         struct vmap_area *va;
    2547             : 
    2548             :         might_sleep();
    2549             : 
    2550           0 :         spin_lock(&vmap_area_lock);
    2551           0 :         va = __find_vmap_area((unsigned long)addr);
    2552           0 :         if (va && va->vm) {
    2553           0 :                 struct vm_struct *vm = va->vm;
    2554             : 
    2555           0 :                 va->vm = NULL;
    2556           0 :                 spin_unlock(&vmap_area_lock);
    2557             : 
    2558           0 :                 kasan_free_module_shadow(vm);
    2559           0 :                 free_unmap_vmap_area(va);
    2560             : 
    2561           0 :                 return vm;
    2562             :         }
    2563             : 
    2564           0 :         spin_unlock(&vmap_area_lock);
    2565           0 :         return NULL;
    2566             : }
    2567             : 
    2568           0 : static inline void set_area_direct_map(const struct vm_struct *area,
    2569             :                                        int (*set_direct_map)(struct page *page))
    2570             : {
    2571             :         int i;
    2572             : 
    2573             :         /* HUGE_VMALLOC passes small pages to set_direct_map */
    2574           0 :         for (i = 0; i < area->nr_pages; i++)
    2575           0 :                 if (page_address(area->pages[i]))
    2576           0 :                         set_direct_map(area->pages[i]);
    2577           0 : }
    2578             : 
    2579             : /* Handle removing and resetting vm mappings related to the vm_struct. */
    2580           0 : static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
    2581             : {
    2582           0 :         unsigned long start = ULONG_MAX, end = 0;
    2583           0 :         unsigned int page_order = vm_area_page_order(area);
    2584           0 :         int flush_reset = area->flags & VM_FLUSH_RESET_PERMS;
    2585           0 :         int flush_dmap = 0;
    2586             :         int i;
    2587             : 
    2588           0 :         remove_vm_area(area->addr);
    2589             : 
    2590             :         /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */
    2591           0 :         if (!flush_reset)
    2592             :                 return;
    2593             : 
    2594             :         /*
    2595             :          * If not deallocating pages, just do the flush of the VM area and
    2596             :          * return.
    2597             :          */
    2598           0 :         if (!deallocate_pages) {
    2599             :                 vm_unmap_aliases();
    2600             :                 return;
    2601             :         }
    2602             : 
    2603             :         /*
    2604             :          * If execution gets here, flush the vm mapping and reset the direct
    2605             :          * map. Find the start and end range of the direct mappings to make sure
    2606             :          * the vm_unmap_aliases() flush includes the direct map.
    2607             :          */
    2608           0 :         for (i = 0; i < area->nr_pages; i += 1U << page_order) {
    2609           0 :                 unsigned long addr = (unsigned long)page_address(area->pages[i]);
    2610           0 :                 if (addr) {
    2611             :                         unsigned long page_size;
    2612             : 
    2613           0 :                         page_size = PAGE_SIZE << page_order;
    2614           0 :                         start = min(addr, start);
    2615           0 :                         end = max(addr + page_size, end);
    2616           0 :                         flush_dmap = 1;
    2617             :                 }
    2618             :         }
    2619             : 
    2620             :         /*
    2621             :          * Set direct map to something invalid so that it won't be cached if
    2622             :          * there are any accesses after the TLB flush, then flush the TLB and
    2623             :          * reset the direct map permissions to the default.
    2624             :          */
    2625           0 :         set_area_direct_map(area, set_direct_map_invalid_noflush);
    2626           0 :         _vm_unmap_aliases(start, end, flush_dmap);
    2627           0 :         set_area_direct_map(area, set_direct_map_default_noflush);
    2628             : }
    2629             : 
    2630           0 : static void __vunmap(const void *addr, int deallocate_pages)
    2631             : {
    2632             :         struct vm_struct *area;
    2633             : 
    2634           0 :         if (!addr)
    2635             :                 return;
    2636             : 
    2637           0 :         if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
    2638             :                         addr))
    2639             :                 return;
    2640             : 
    2641           0 :         area = find_vm_area(addr);
    2642           0 :         if (unlikely(!area)) {
    2643           0 :                 WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
    2644             :                                 addr);
    2645           0 :                 return;
    2646             :         }
    2647             : 
    2648           0 :         debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
    2649           0 :         debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
    2650             : 
    2651           0 :         kasan_poison_vmalloc(area->addr, get_vm_area_size(area));
    2652             : 
    2653           0 :         vm_remove_mappings(area, deallocate_pages);
    2654             : 
    2655           0 :         if (deallocate_pages) {
    2656             :                 int i;
    2657             : 
    2658           0 :                 for (i = 0; i < area->nr_pages; i++) {
    2659           0 :                         struct page *page = area->pages[i];
    2660             : 
    2661           0 :                         BUG_ON(!page);
    2662           0 :                         mod_memcg_page_state(page, MEMCG_VMALLOC, -1);
    2663             :                         /*
    2664             :                          * High-order allocs for huge vmallocs are split, so
    2665             :                          * can be freed as an array of order-0 allocations
    2666             :                          */
    2667           0 :                         __free_pages(page, 0);
    2668           0 :                         cond_resched();
    2669             :                 }
    2670           0 :                 atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
    2671             : 
    2672           0 :                 kvfree(area->pages);
    2673             :         }
    2674             : 
    2675           0 :         kfree(area);
    2676             : }
    2677             : 
    2678           0 : static inline void __vfree_deferred(const void *addr)
    2679             : {
    2680             :         /*
    2681             :          * Use raw_cpu_ptr() because this can be called from preemptible
    2682             :          * context. Preemption is absolutely fine here, because the llist_add()
    2683             :          * implementation is lockless, so it works even if we are adding to
    2684             :          * another cpu's list. schedule_work() should be fine with this too.
    2685             :          */
    2686           0 :         struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
    2687             : 
    2688           0 :         if (llist_add((struct llist_node *)addr, &p->list))
    2689           0 :                 schedule_work(&p->wq);
    2690           0 : }
    2691             : 
    2692             : /**
    2693             :  * vfree_atomic - release memory allocated by vmalloc()
    2694             :  * @addr:         memory base address
    2695             :  *
    2696             :  * This one is just like vfree() but can be called in any atomic context
    2697             :  * except NMIs.
    2698             :  */
    2699           0 : void vfree_atomic(const void *addr)
    2700             : {
    2701           0 :         BUG_ON(in_nmi());
    2702             : 
    2703           0 :         kmemleak_free(addr);
    2704             : 
    2705           0 :         if (!addr)
    2706             :                 return;
    2707           0 :         __vfree_deferred(addr);
    2708             : }
    2709             : 
    2710           0 : static void __vfree(const void *addr)
    2711             : {
    2712           0 :         if (unlikely(in_interrupt()))
    2713           0 :                 __vfree_deferred(addr);
    2714             :         else
    2715           0 :                 __vunmap(addr, 1);
    2716           0 : }
    2717             : 
    2718             : /**
    2719             :  * vfree - Release memory allocated by vmalloc()
    2720             :  * @addr:  Memory base address
    2721             :  *
    2722             :  * Free the virtually continuous memory area starting at @addr, as obtained
    2723             :  * from one of the vmalloc() family of APIs.  This will usually also free the
    2724             :  * physical memory underlying the virtual allocation, but that memory is
    2725             :  * reference counted, so it will not be freed until the last user goes away.
    2726             :  *
    2727             :  * If @addr is NULL, no operation is performed.
    2728             :  *
    2729             :  * Context:
    2730             :  * May sleep if called *not* from interrupt context.
    2731             :  * Must not be called in NMI context (strictly speaking, it could be
    2732             :  * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
    2733             :  * conventions for vfree() arch-dependent would be a really bad idea).
    2734             :  */
    2735           0 : void vfree(const void *addr)
    2736             : {
    2737           0 :         BUG_ON(in_nmi());
    2738             : 
    2739           0 :         kmemleak_free(addr);
    2740             : 
    2741             :         might_sleep_if(!in_interrupt());
    2742             : 
    2743           0 :         if (!addr)
    2744             :                 return;
    2745             : 
    2746           0 :         __vfree(addr);
    2747             : }
    2748             : EXPORT_SYMBOL(vfree);
    2749             : 
    2750             : /**
    2751             :  * vunmap - release virtual mapping obtained by vmap()
    2752             :  * @addr:   memory base address
    2753             :  *
    2754             :  * Free the virtually contiguous memory area starting at @addr,
    2755             :  * which was created from the page array passed to vmap().
    2756             :  *
    2757             :  * Must not be called in interrupt context.
    2758             :  */
    2759           0 : void vunmap(const void *addr)
    2760             : {
    2761           0 :         BUG_ON(in_interrupt());
    2762             :         might_sleep();
    2763           0 :         if (addr)
    2764           0 :                 __vunmap(addr, 0);
    2765           0 : }
    2766             : EXPORT_SYMBOL(vunmap);
    2767             : 
    2768             : /**
    2769             :  * vmap - map an array of pages into virtually contiguous space
    2770             :  * @pages: array of page pointers
    2771             :  * @count: number of pages to map
    2772             :  * @flags: vm_area->flags
    2773             :  * @prot: page protection for the mapping
    2774             :  *
    2775             :  * Maps @count pages from @pages into contiguous kernel virtual space.
    2776             :  * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself
    2777             :  * (which must be kmalloc or vmalloc memory) and one reference per page in it
    2778             :  * are transferred from the caller to vmap(), and will be freed / dropped when
    2779             :  * vfree() is called on the return value.
    2780             :  *
    2781             :  * Return: the address of the area or %NULL on failure
    2782             :  */
    2783           0 : void *vmap(struct page **pages, unsigned int count,
    2784             :            unsigned long flags, pgprot_t prot)
    2785             : {
    2786             :         struct vm_struct *area;
    2787             :         unsigned long addr;
    2788             :         unsigned long size;             /* In bytes */
    2789             : 
    2790             :         might_sleep();
    2791             : 
    2792             :         /*
    2793             :          * Your top guard is someone else's bottom guard. Not having a top
    2794             :          * guard compromises someone else's mappings too.
    2795             :          */
    2796           0 :         if (WARN_ON_ONCE(flags & VM_NO_GUARD))
    2797           0 :                 flags &= ~VM_NO_GUARD;
    2798             : 
    2799           0 :         if (count > totalram_pages())
    2800             :                 return NULL;
    2801             : 
    2802           0 :         size = (unsigned long)count << PAGE_SHIFT;
    2803           0 :         area = get_vm_area_caller(size, flags, __builtin_return_address(0));
    2804           0 :         if (!area)
    2805             :                 return NULL;
    2806             : 
    2807           0 :         addr = (unsigned long)area->addr;
    2808           0 :         if (vmap_pages_range(addr, addr + size, pgprot_nx(prot),
    2809             :                                 pages, PAGE_SHIFT) < 0) {
    2810           0 :                 vunmap(area->addr);
    2811           0 :                 return NULL;
    2812             :         }
    2813             : 
    2814           0 :         if (flags & VM_MAP_PUT_PAGES) {
    2815           0 :                 area->pages = pages;
    2816           0 :                 area->nr_pages = count;
    2817             :         }
    2818           0 :         return area->addr;
    2819             : }
    2820             : EXPORT_SYMBOL(vmap);
    2821             : 
    2822             : #ifdef CONFIG_VMAP_PFN
    2823             : struct vmap_pfn_data {
    2824             :         unsigned long   *pfns;
    2825             :         pgprot_t        prot;
    2826             :         unsigned int    idx;
    2827             : };
    2828             : 
    2829             : static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private)
    2830             : {
    2831             :         struct vmap_pfn_data *data = private;
    2832             : 
    2833             :         if (WARN_ON_ONCE(pfn_valid(data->pfns[data->idx])))
    2834             :                 return -EINVAL;
    2835             :         *pte = pte_mkspecial(pfn_pte(data->pfns[data->idx++], data->prot));
    2836             :         return 0;
    2837             : }
    2838             : 
    2839             : /**
    2840             :  * vmap_pfn - map an array of PFNs into virtually contiguous space
    2841             :  * @pfns: array of PFNs
    2842             :  * @count: number of pages to map
    2843             :  * @prot: page protection for the mapping
    2844             :  *
    2845             :  * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns
    2846             :  * the start address of the mapping.
    2847             :  */
    2848             : void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot)
    2849             : {
    2850             :         struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) };
    2851             :         struct vm_struct *area;
    2852             : 
    2853             :         area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP,
    2854             :                         __builtin_return_address(0));
    2855             :         if (!area)
    2856             :                 return NULL;
    2857             :         if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
    2858             :                         count * PAGE_SIZE, vmap_pfn_apply, &data)) {
    2859             :                 free_vm_area(area);
    2860             :                 return NULL;
    2861             :         }
    2862             :         return area->addr;
    2863             : }
    2864             : EXPORT_SYMBOL_GPL(vmap_pfn);
    2865             : #endif /* CONFIG_VMAP_PFN */
    2866             : 
    2867             : static inline unsigned int
    2868          15 : vm_area_alloc_pages(gfp_t gfp, int nid,
    2869             :                 unsigned int order, unsigned int nr_pages, struct page **pages)
    2870             : {
    2871          15 :         unsigned int nr_allocated = 0;
    2872             :         struct page *page;
    2873             :         int i;
    2874             : 
    2875             :         /*
    2876             :          * For order-0 pages we make use of the bulk allocator. If
    2877             :          * the page array is partly or not at all populated due
    2878             :          * to failures, fall back to the single page allocator,
    2879             :          * which is more permissive.
    2880             :          */
    2881          15 :         if (!order) {
    2882          15 :                 gfp_t bulk_gfp = gfp & ~__GFP_NOFAIL;
    2883             : 
    2884          45 :                 while (nr_allocated < nr_pages) {
    2885             :                         unsigned int nr, nr_pages_request;
    2886             : 
    2887             :                         /*
    2888             :                          * A maximum allowed request is hard-coded and is 100
    2889             :                          * pages per call. That is done in order to prevent a
    2890             :                          * long preemption off scenario in the bulk-allocator
    2891             :                          * so the range is [1:100].
    2892             :                          */
    2893          15 :                         nr_pages_request = min(100U, nr_pages - nr_allocated);
    2894             : 
    2895             :                         /* memory allocation should consider mempolicy, we can't
    2896             :                          * wrongly use nearest node when nid == NUMA_NO_NODE,
    2897             :                          * otherwise memory may be allocated in only one node,
    2898             :                          * but mempolicy wants to alloc memory by interleaving.
    2899             :                          */
    2900             :                         if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE)
    2901             :                                 nr = alloc_pages_bulk_array_mempolicy(bulk_gfp,
    2902             :                                                         nr_pages_request,
    2903             :                                                         pages + nr_allocated);
    2904             : 
    2905             :                         else
    2906          30 :                                 nr = alloc_pages_bulk_array_node(bulk_gfp, nid,
    2907             :                                                         nr_pages_request,
    2908          15 :                                                         pages + nr_allocated);
    2909             : 
    2910          15 :                         nr_allocated += nr;
    2911          15 :                         cond_resched();
    2912             : 
    2913             :                         /*
    2914             :                          * If no pages or only part of the request was
    2915             :                          * obtained, fall back to the single page allocator.
    2916             :                          */
    2917          15 :                         if (nr != nr_pages_request)
    2918             :                                 break;
    2919             :                 }
    2920             :         }
    2921             : 
    2922             :         /* High-order pages or fallback path if "bulk" fails. */
    2923             : 
    2924          15 :         while (nr_allocated < nr_pages) {
    2925           0 :                 if (fatal_signal_pending(current))
    2926             :                         break;
    2927             : 
    2928           0 :                 if (nid == NUMA_NO_NODE)
    2929           0 :                         page = alloc_pages(gfp, order);
    2930             :                 else
    2931           0 :                         page = alloc_pages_node(nid, gfp, order);
    2932           0 :                 if (unlikely(!page))
    2933             :                         break;
    2934             :                 /*
    2935             :                  * Higher order allocations must be able to be treated as
    2936             :                  * independent small pages by callers (as they can with
    2937             :                  * small-page vmallocs). Some drivers do their own refcounting
    2938             :                  * on vmalloc_to_page() pages, some use page->mapping,
    2939             :                  * page->lru, etc.
    2940             :                  */
    2941           0 :                 if (order)
    2942           0 :                         split_page(page, order);
    2943             : 
    2944             :                 /*
    2945             :                  * Careful, we allocate and map page-order pages, but
    2946             :                  * tracking is done per PAGE_SIZE page so as to keep the
    2947             :                  * vm_struct APIs independent of the physical/mapped size.
    2948             :                  */
    2949           0 :                 for (i = 0; i < (1U << order); i++)
    2950           0 :                         pages[nr_allocated + i] = page + i;
    2951             : 
    2952           0 :                 cond_resched();
    2953           0 :                 nr_allocated += 1U << order;
    2954             :         }
    2955             : 
    2956          15 :         return nr_allocated;
    2957             : }
    2958             : 
    2959          15 : static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
    2960             :                                  pgprot_t prot, unsigned int page_shift,
    2961             :                                  int node)
    2962             : {
    2963          15 :         const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
    2964          15 :         bool nofail = gfp_mask & __GFP_NOFAIL;
    2965          15 :         unsigned long addr = (unsigned long)area->addr;
    2966          30 :         unsigned long size = get_vm_area_size(area);
    2967             :         unsigned long array_size;
    2968          15 :         unsigned int nr_small_pages = size >> PAGE_SHIFT;
    2969             :         unsigned int page_order;
    2970             :         unsigned int flags;
    2971             :         int ret;
    2972             : 
    2973          15 :         array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
    2974          15 :         gfp_mask |= __GFP_NOWARN;
    2975          15 :         if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
    2976          15 :                 gfp_mask |= __GFP_HIGHMEM;
    2977             : 
    2978             :         /* Please note that the recursion is strictly bounded. */
    2979          15 :         if (array_size > PAGE_SIZE) {
    2980           0 :                 area->pages = __vmalloc_node(array_size, 1, nested_gfp, node,
    2981             :                                         area->caller);
    2982             :         } else {
    2983          15 :                 area->pages = kmalloc_node(array_size, nested_gfp, node);
    2984             :         }
    2985             : 
    2986          15 :         if (!area->pages) {
    2987           0 :                 warn_alloc(gfp_mask, NULL,
    2988             :                         "vmalloc error: size %lu, failed to allocated page array size %lu",
    2989             :                         nr_small_pages * PAGE_SIZE, array_size);
    2990           0 :                 free_vm_area(area);
    2991           0 :                 return NULL;
    2992             :         }
    2993             : 
    2994          15 :         set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
    2995          15 :         page_order = vm_area_page_order(area);
    2996             : 
    2997          15 :         area->nr_pages = vm_area_alloc_pages(gfp_mask | __GFP_NOWARN,
    2998             :                 node, page_order, nr_small_pages, area->pages);
    2999             : 
    3000          30 :         atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
    3001          15 :         if (gfp_mask & __GFP_ACCOUNT) {
    3002             :                 int i;
    3003             : 
    3004           0 :                 for (i = 0; i < area->nr_pages; i++)
    3005           0 :                         mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, 1);
    3006             :         }
    3007             : 
    3008             :         /*
    3009             :          * If not enough pages were obtained to accomplish an
    3010             :          * allocation request, free them via __vfree() if any.
    3011             :          */
    3012          15 :         if (area->nr_pages != nr_small_pages) {
    3013           0 :                 warn_alloc(gfp_mask, NULL,
    3014             :                         "vmalloc error: size %lu, page order %u, failed to allocate pages",
    3015           0 :                         area->nr_pages * PAGE_SIZE, page_order);
    3016           0 :                 goto fail;
    3017             :         }
    3018             : 
    3019             :         /*
    3020             :          * Page table allocations ignore the external gfp mask, so
    3021             :          * enforce it via the scope API.
    3022             :          */
    3023          15 :         if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
    3024           0 :                 flags = memalloc_nofs_save();
    3025          15 :         else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
    3026           0 :                 flags = memalloc_noio_save();
    3027             : 
    3028             :         do {
    3029          30 :                 ret = vmap_pages_range(addr, addr + size, prot, area->pages,
    3030             :                         page_shift);
    3031          15 :                 if (nofail && (ret < 0))
    3032           0 :                         schedule_timeout_uninterruptible(1);
    3033          15 :         } while (nofail && (ret < 0));
    3034             : 
    3035          15 :         if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
    3036             :                 memalloc_nofs_restore(flags);
    3037          15 :         else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
    3038             :                 memalloc_noio_restore(flags);
    3039             : 
    3040          15 :         if (ret < 0) {
    3041           0 :                 warn_alloc(gfp_mask, NULL,
    3042             :                         "vmalloc error: size %lu, failed to map pages",
    3043           0 :                         area->nr_pages * PAGE_SIZE);
    3044           0 :                 goto fail;
    3045             :         }
    3046             : 
    3047          15 :         return area->addr;
    3048             : 
    3049             : fail:
    3050           0 :         __vfree(area->addr);
    3051           0 :         return NULL;
    3052             : }
    3053             : 
    3054             : /**
    3055             :  * __vmalloc_node_range - allocate virtually contiguous memory
    3056             :  * @size:                 allocation size
    3057             :  * @align:                desired alignment
    3058             :  * @start:                vm area range start
    3059             :  * @end:                  vm area range end
    3060             :  * @gfp_mask:             flags for the page level allocator
    3061             :  * @prot:                 protection mask for the allocated pages
    3062             :  * @vm_flags:             additional vm area flags (e.g. %VM_NO_GUARD)
    3063             :  * @node:                 node to use for allocation or NUMA_NO_NODE
    3064             :  * @caller:               caller's return address
    3065             :  *
    3066             :  * Allocate enough pages to cover @size from the page level
    3067             :  * allocator with @gfp_mask flags. Please note that the full set of gfp
    3068             :  * flags are not supported. GFP_KERNEL, GFP_NOFS and GFP_NOIO are all
    3069             :  * supported.
    3070             :  * Zone modifiers are not supported. From the reclaim modifiers
    3071             :  * __GFP_DIRECT_RECLAIM is required (aka GFP_NOWAIT is not supported)
    3072             :  * and only __GFP_NOFAIL is supported (i.e. __GFP_NORETRY and
    3073             :  * __GFP_RETRY_MAYFAIL are not supported).
    3074             :  *
    3075             :  * __GFP_NOWARN can be used to suppress failure messages.
    3076             :  *
    3077             :  * Map them into contiguous kernel virtual space, using a pagetable
    3078             :  * protection of @prot.
    3079             :  *
    3080             :  * Return: the address of the area or %NULL on failure
    3081             :  */
    3082          15 : void *__vmalloc_node_range(unsigned long size, unsigned long align,
    3083             :                         unsigned long start, unsigned long end, gfp_t gfp_mask,
    3084             :                         pgprot_t prot, unsigned long vm_flags, int node,
    3085             :                         const void *caller)
    3086             : {
    3087             :         struct vm_struct *area;
    3088             :         void *ret;
    3089          15 :         kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE;
    3090          15 :         unsigned long real_size = size;
    3091          15 :         unsigned long real_align = align;
    3092          15 :         unsigned int shift = PAGE_SHIFT;
    3093             : 
    3094          15 :         if (WARN_ON_ONCE(!size))
    3095             :                 return NULL;
    3096             : 
    3097          30 :         if ((size >> PAGE_SHIFT) > totalram_pages()) {
    3098           0 :                 warn_alloc(gfp_mask, NULL,
    3099             :                         "vmalloc error: size %lu, exceeds total pages",
    3100             :                         real_size);
    3101           0 :                 return NULL;
    3102             :         }
    3103             : 
    3104             :         if (vmap_allow_huge && (vm_flags & VM_ALLOW_HUGE_VMAP)) {
    3105             :                 unsigned long size_per_node;
    3106             : 
    3107             :                 /*
    3108             :                  * Try huge pages. Only try for PAGE_KERNEL allocations,
    3109             :                  * others like modules don't yet expect huge pages in
    3110             :                  * their allocations due to apply_to_page_range not
    3111             :                  * supporting them.
    3112             :                  */
    3113             : 
    3114             :                 size_per_node = size;
    3115             :                 if (node == NUMA_NO_NODE)
    3116             :                         size_per_node /= num_online_nodes();
    3117             :                 if (arch_vmap_pmd_supported(prot) && size_per_node >= PMD_SIZE)
    3118             :                         shift = PMD_SHIFT;
    3119             :                 else
    3120             :                         shift = arch_vmap_pte_supported_shift(size_per_node);
    3121             : 
    3122             :                 align = max(real_align, 1UL << shift);
    3123             :                 size = ALIGN(real_size, 1UL << shift);
    3124             :         }
    3125             : 
    3126             : again:
    3127          15 :         area = __get_vm_area_node(real_size, align, shift, VM_ALLOC |
    3128             :                                   VM_UNINITIALIZED | vm_flags, start, end, node,
    3129             :                                   gfp_mask, caller);
    3130          15 :         if (!area) {
    3131           0 :                 bool nofail = gfp_mask & __GFP_NOFAIL;
    3132           0 :                 warn_alloc(gfp_mask, NULL,
    3133             :                         "vmalloc error: size %lu, vm_struct allocation failed%s",
    3134             :                         real_size, (nofail) ? ". Retrying." : "");
    3135           0 :                 if (nofail) {
    3136           0 :                         schedule_timeout_uninterruptible(1);
    3137           0 :                         goto again;
    3138             :                 }
    3139             :                 goto fail;
    3140             :         }
    3141             : 
    3142             :         /*
    3143             :          * Prepare arguments for __vmalloc_area_node() and
    3144             :          * kasan_unpoison_vmalloc().
    3145             :          */
    3146          15 :         if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) {
    3147             :                 if (kasan_hw_tags_enabled()) {
    3148             :                         /*
    3149             :                          * Modify protection bits to allow tagging.
    3150             :                          * This must be done before mapping.
    3151             :                          */
    3152             :                         prot = arch_vmap_pgprot_tagged(prot);
    3153             : 
    3154             :                         /*
    3155             :                          * Skip page_alloc poisoning and zeroing for physical
    3156             :                          * pages backing VM_ALLOC mapping. Memory is instead
    3157             :                          * poisoned and zeroed by kasan_unpoison_vmalloc().
    3158             :                          */
    3159             :                         gfp_mask |= __GFP_SKIP_KASAN_UNPOISON | __GFP_SKIP_ZERO;
    3160             :                 }
    3161             : 
    3162             :                 /* Take note that the mapping is PAGE_KERNEL. */
    3163             :                 kasan_flags |= KASAN_VMALLOC_PROT_NORMAL;
    3164             :         }
    3165             : 
    3166             :         /* Allocate physical pages and map them into vmalloc space. */
    3167          15 :         ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
    3168          15 :         if (!ret)
    3169             :                 goto fail;
    3170             : 
    3171             :         /*
    3172             :          * Mark the pages as accessible, now that they are mapped.
    3173             :          * The init condition should match the one in post_alloc_hook()
    3174             :          * (except for the should_skip_init() check) to make sure that memory
    3175             :          * is initialized under the same conditions regardless of the enabled
    3176             :          * KASAN mode.
    3177             :          * Tag-based KASAN modes only assign tags to normal non-executable
    3178             :          * allocations, see __kasan_unpoison_vmalloc().
    3179             :          */
    3180          15 :         kasan_flags |= KASAN_VMALLOC_VM_ALLOC;
    3181          30 :         if (!want_init_on_free() && want_init_on_alloc(gfp_mask))
    3182             :                 kasan_flags |= KASAN_VMALLOC_INIT;
    3183             :         /* KASAN_VMALLOC_PROT_NORMAL already set if required. */
    3184          15 :         area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags);
    3185             : 
    3186             :         /*
    3187             :          * In this function, newly allocated vm_struct has VM_UNINITIALIZED
    3188             :          * flag. It means that vm_struct is not fully initialized.
    3189             :          * Now, it is fully initialized, so remove this flag here.
    3190             :          */
    3191          15 :         clear_vm_uninitialized_flag(area);
    3192             : 
    3193          15 :         size = PAGE_ALIGN(size);
    3194             :         if (!(vm_flags & VM_DEFER_KMEMLEAK))
    3195          15 :                 kmemleak_vmalloc(area, size, gfp_mask);
    3196             : 
    3197          15 :         return area->addr;
    3198             : 
    3199             : fail:
    3200             :         if (shift > PAGE_SHIFT) {
    3201             :                 shift = PAGE_SHIFT;
    3202             :                 align = real_align;
    3203             :                 size = real_size;
    3204             :                 goto again;
    3205             :         }
    3206             : 
    3207             :         return NULL;
    3208             : }
    3209             : 
    3210             : /**
    3211             :  * __vmalloc_node - allocate virtually contiguous memory
    3212             :  * @size:           allocation size
    3213             :  * @align:          desired alignment
    3214             :  * @gfp_mask:       flags for the page level allocator
    3215             :  * @node:           node to use for allocation or NUMA_NO_NODE
    3216             :  * @caller:         caller's return address
    3217             :  *
    3218             :  * Allocate enough pages to cover @size from the page level allocator with
    3219             :  * @gfp_mask flags.  Map them into contiguous kernel virtual space.
    3220             :  *
    3221             :  * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
    3222             :  * and __GFP_NOFAIL are not supported
    3223             :  *
    3224             :  * Any use of gfp flags outside of GFP_KERNEL should be consulted
    3225             :  * with mm people.
    3226             :  *
    3227             :  * Return: pointer to the allocated memory or %NULL on error
    3228             :  */
    3229           0 : void *__vmalloc_node(unsigned long size, unsigned long align,
    3230             :                             gfp_t gfp_mask, int node, const void *caller)
    3231             : {
    3232           0 :         return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
    3233           0 :                                 gfp_mask, PAGE_KERNEL, 0, node, caller);
    3234             : }
    3235             : /*
    3236             :  * This is only for performance analysis of vmalloc and stress purpose.
    3237             :  * It is required by vmalloc test module, therefore do not use it other
    3238             :  * than that.
    3239             :  */
    3240             : #ifdef CONFIG_TEST_VMALLOC_MODULE
    3241             : EXPORT_SYMBOL_GPL(__vmalloc_node);
    3242             : #endif
    3243             : 
    3244           0 : void *__vmalloc(unsigned long size, gfp_t gfp_mask)
    3245             : {
    3246           0 :         return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE,
    3247           0 :                                 __builtin_return_address(0));
    3248             : }
    3249             : EXPORT_SYMBOL(__vmalloc);
    3250             : 
    3251             : /**
    3252             :  * vmalloc - allocate virtually contiguous memory
    3253             :  * @size:    allocation size
    3254             :  *
    3255             :  * Allocate enough pages to cover @size from the page level
    3256             :  * allocator and map them into contiguous kernel virtual space.
    3257             :  *
    3258             :  * For tight control over page level allocator and protection flags
    3259             :  * use __vmalloc() instead.
    3260             :  *
    3261             :  * Return: pointer to the allocated memory or %NULL on error
    3262             :  */
    3263           0 : void *vmalloc(unsigned long size)
    3264             : {
    3265           0 :         return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE,
    3266           0 :                                 __builtin_return_address(0));
    3267             : }
    3268             : EXPORT_SYMBOL(vmalloc);
    3269             : 
    3270             : /**
    3271             :  * vmalloc_huge - allocate virtually contiguous memory, allow huge pages
    3272             :  * @size:      allocation size
    3273             :  * @gfp_mask:  flags for the page level allocator
    3274             :  *
    3275             :  * Allocate enough pages to cover @size from the page level
    3276             :  * allocator and map them into contiguous kernel virtual space.
    3277             :  * If @size is greater than or equal to PMD_SIZE, allow using
    3278             :  * huge pages for the memory
    3279             :  *
    3280             :  * Return: pointer to the allocated memory or %NULL on error
    3281             :  */
    3282           0 : void *vmalloc_huge(unsigned long size, gfp_t gfp_mask)
    3283             : {
    3284           0 :         return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
    3285           0 :                                     gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
    3286           0 :                                     NUMA_NO_NODE, __builtin_return_address(0));
    3287             : }
    3288             : EXPORT_SYMBOL_GPL(vmalloc_huge);
    3289             : 
    3290             : /**
    3291             :  * vzalloc - allocate virtually contiguous memory with zero fill
    3292             :  * @size:    allocation size
    3293             :  *
    3294             :  * Allocate enough pages to cover @size from the page level
    3295             :  * allocator and map them into contiguous kernel virtual space.
    3296             :  * The memory allocated is set to zero.
    3297             :  *
    3298             :  * For tight control over page level allocator and protection flags
    3299             :  * use __vmalloc() instead.
    3300             :  *
    3301             :  * Return: pointer to the allocated memory or %NULL on error
    3302             :  */
    3303           0 : void *vzalloc(unsigned long size)
    3304             : {
    3305           0 :         return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE,
    3306           0 :                                 __builtin_return_address(0));
    3307             : }
    3308             : EXPORT_SYMBOL(vzalloc);
    3309             : 
    3310             : /**
    3311             :  * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
    3312             :  * @size: allocation size
    3313             :  *
    3314             :  * The resulting memory area is zeroed so it can be mapped to userspace
    3315             :  * without leaking data.
    3316             :  *
    3317             :  * Return: pointer to the allocated memory or %NULL on error
    3318             :  */
    3319           0 : void *vmalloc_user(unsigned long size)
    3320             : {
    3321           0 :         return __vmalloc_node_range(size, SHMLBA,  VMALLOC_START, VMALLOC_END,
    3322           0 :                                     GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL,
    3323             :                                     VM_USERMAP, NUMA_NO_NODE,
    3324           0 :                                     __builtin_return_address(0));
    3325             : }
    3326             : EXPORT_SYMBOL(vmalloc_user);
    3327             : 
    3328             : /**
    3329             :  * vmalloc_node - allocate memory on a specific node
    3330             :  * @size:         allocation size
    3331             :  * @node:         numa node
    3332             :  *
    3333             :  * Allocate enough pages to cover @size from the page level
    3334             :  * allocator and map them into contiguous kernel virtual space.
    3335             :  *
    3336             :  * For tight control over page level allocator and protection flags
    3337             :  * use __vmalloc() instead.
    3338             :  *
    3339             :  * Return: pointer to the allocated memory or %NULL on error
    3340             :  */
    3341           0 : void *vmalloc_node(unsigned long size, int node)
    3342             : {
    3343           0 :         return __vmalloc_node(size, 1, GFP_KERNEL, node,
    3344           0 :                         __builtin_return_address(0));
    3345             : }
    3346             : EXPORT_SYMBOL(vmalloc_node);
    3347             : 
    3348             : /**
    3349             :  * vzalloc_node - allocate memory on a specific node with zero fill
    3350             :  * @size:       allocation size
    3351             :  * @node:       numa node
    3352             :  *
    3353             :  * Allocate enough pages to cover @size from the page level
    3354             :  * allocator and map them into contiguous kernel virtual space.
    3355             :  * The memory allocated is set to zero.
    3356             :  *
    3357             :  * Return: pointer to the allocated memory or %NULL on error
    3358             :  */
    3359           0 : void *vzalloc_node(unsigned long size, int node)
    3360             : {
    3361           0 :         return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node,
    3362           0 :                                 __builtin_return_address(0));
    3363             : }
    3364             : EXPORT_SYMBOL(vzalloc_node);
    3365             : 
    3366             : #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
    3367             : #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
    3368             : #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
    3369             : #define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL)
    3370             : #else
    3371             : /*
    3372             :  * 64b systems should always have either DMA or DMA32 zones. For others
    3373             :  * GFP_DMA32 should do the right thing and use the normal zone.
    3374             :  */
    3375             : #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
    3376             : #endif
    3377             : 
    3378             : /**
    3379             :  * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
    3380             :  * @size:       allocation size
    3381             :  *
    3382             :  * Allocate enough 32bit PA addressable pages to cover @size from the
    3383             :  * page level allocator and map them into contiguous kernel virtual space.
    3384             :  *
    3385             :  * Return: pointer to the allocated memory or %NULL on error
    3386             :  */
    3387           0 : void *vmalloc_32(unsigned long size)
    3388             : {
    3389           0 :         return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE,
    3390           0 :                         __builtin_return_address(0));
    3391             : }
    3392             : EXPORT_SYMBOL(vmalloc_32);
    3393             : 
    3394             : /**
    3395             :  * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
    3396             :  * @size:            allocation size
    3397             :  *
    3398             :  * The resulting memory area is 32bit addressable and zeroed so it can be
    3399             :  * mapped to userspace without leaking data.
    3400             :  *
    3401             :  * Return: pointer to the allocated memory or %NULL on error
    3402             :  */
    3403           0 : void *vmalloc_32_user(unsigned long size)
    3404             : {
    3405           0 :         return __vmalloc_node_range(size, SHMLBA,  VMALLOC_START, VMALLOC_END,
    3406           0 :                                     GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
    3407             :                                     VM_USERMAP, NUMA_NO_NODE,
    3408           0 :                                     __builtin_return_address(0));
    3409             : }
    3410             : EXPORT_SYMBOL(vmalloc_32_user);
    3411             : 
    3412             : /*
    3413             :  * small helper routine , copy contents to buf from addr.
    3414             :  * If the page is not present, fill zero.
    3415             :  */
    3416             : 
    3417           0 : static int aligned_vread(char *buf, char *addr, unsigned long count)
    3418             : {
    3419             :         struct page *p;
    3420           0 :         int copied = 0;
    3421             : 
    3422           0 :         while (count) {
    3423             :                 unsigned long offset, length;
    3424             : 
    3425           0 :                 offset = offset_in_page(addr);
    3426           0 :                 length = PAGE_SIZE - offset;
    3427           0 :                 if (length > count)
    3428           0 :                         length = count;
    3429           0 :                 p = vmalloc_to_page(addr);
    3430             :                 /*
    3431             :                  * To do safe access to this _mapped_ area, we need
    3432             :                  * lock. But adding lock here means that we need to add
    3433             :                  * overhead of vmalloc()/vfree() calls for this _debug_
    3434             :                  * interface, rarely used. Instead of that, we'll use
    3435             :                  * kmap() and get small overhead in this access function.
    3436             :                  */
    3437           0 :                 if (p) {
    3438             :                         /* We can expect USER0 is not used -- see vread() */
    3439           0 :                         void *map = kmap_atomic(p);
    3440           0 :                         memcpy(buf, map + offset, length);
    3441             :                         kunmap_atomic(map);
    3442             :                 } else
    3443           0 :                         memset(buf, 0, length);
    3444             : 
    3445           0 :                 addr += length;
    3446           0 :                 buf += length;
    3447           0 :                 copied += length;
    3448           0 :                 count -= length;
    3449             :         }
    3450           0 :         return copied;
    3451             : }
    3452             : 
    3453             : /**
    3454             :  * vread() - read vmalloc area in a safe way.
    3455             :  * @buf:     buffer for reading data
    3456             :  * @addr:    vm address.
    3457             :  * @count:   number of bytes to be read.
    3458             :  *
    3459             :  * This function checks that addr is a valid vmalloc'ed area, and
    3460             :  * copy data from that area to a given buffer. If the given memory range
    3461             :  * of [addr...addr+count) includes some valid address, data is copied to
    3462             :  * proper area of @buf. If there are memory holes, they'll be zero-filled.
    3463             :  * IOREMAP area is treated as memory hole and no copy is done.
    3464             :  *
    3465             :  * If [addr...addr+count) doesn't includes any intersects with alive
    3466             :  * vm_struct area, returns 0. @buf should be kernel's buffer.
    3467             :  *
    3468             :  * Note: In usual ops, vread() is never necessary because the caller
    3469             :  * should know vmalloc() area is valid and can use memcpy().
    3470             :  * This is for routines which have to access vmalloc area without
    3471             :  * any information, as /proc/kcore.
    3472             :  *
    3473             :  * Return: number of bytes for which addr and buf should be increased
    3474             :  * (same number as @count) or %0 if [addr...addr+count) doesn't
    3475             :  * include any intersection with valid vmalloc area
    3476             :  */
    3477           0 : long vread(char *buf, char *addr, unsigned long count)
    3478             : {
    3479             :         struct vmap_area *va;
    3480             :         struct vm_struct *vm;
    3481           0 :         char *vaddr, *buf_start = buf;
    3482           0 :         unsigned long buflen = count;
    3483             :         unsigned long n;
    3484             : 
    3485           0 :         addr = kasan_reset_tag(addr);
    3486             : 
    3487             :         /* Don't allow overflow */
    3488           0 :         if ((unsigned long) addr + count < count)
    3489           0 :                 count = -(unsigned long) addr;
    3490             : 
    3491           0 :         spin_lock(&vmap_area_lock);
    3492           0 :         va = find_vmap_area_exceed_addr((unsigned long)addr);
    3493           0 :         if (!va)
    3494             :                 goto finished;
    3495             : 
    3496             :         /* no intersects with alive vmap_area */
    3497           0 :         if ((unsigned long)addr + count <= va->va_start)
    3498             :                 goto finished;
    3499             : 
    3500           0 :         list_for_each_entry_from(va, &vmap_area_list, list) {
    3501           0 :                 if (!count)
    3502             :                         break;
    3503             : 
    3504           0 :                 if (!va->vm)
    3505           0 :                         continue;
    3506             : 
    3507           0 :                 vm = va->vm;
    3508           0 :                 vaddr = (char *) vm->addr;
    3509           0 :                 if (addr >= vaddr + get_vm_area_size(vm))
    3510           0 :                         continue;
    3511           0 :                 while (addr < vaddr) {
    3512           0 :                         if (count == 0)
    3513             :                                 goto finished;
    3514           0 :                         *buf = '\0';
    3515           0 :                         buf++;
    3516           0 :                         addr++;
    3517           0 :                         count--;
    3518             :                 }
    3519           0 :                 n = vaddr + get_vm_area_size(vm) - addr;
    3520           0 :                 if (n > count)
    3521           0 :                         n = count;
    3522           0 :                 if (!(vm->flags & VM_IOREMAP))
    3523           0 :                         aligned_vread(buf, addr, n);
    3524             :                 else /* IOREMAP area is treated as memory hole */
    3525           0 :                         memset(buf, 0, n);
    3526           0 :                 buf += n;
    3527           0 :                 addr += n;
    3528           0 :                 count -= n;
    3529             :         }
    3530             : finished:
    3531           0 :         spin_unlock(&vmap_area_lock);
    3532             : 
    3533           0 :         if (buf == buf_start)
    3534             :                 return 0;
    3535             :         /* zero-fill memory holes */
    3536           0 :         if (buf != buf_start + buflen)
    3537           0 :                 memset(buf, 0, buflen - (buf - buf_start));
    3538             : 
    3539           0 :         return buflen;
    3540             : }
    3541             : 
    3542             : /**
    3543             :  * remap_vmalloc_range_partial - map vmalloc pages to userspace
    3544             :  * @vma:                vma to cover
    3545             :  * @uaddr:              target user address to start at
    3546             :  * @kaddr:              virtual address of vmalloc kernel memory
    3547             :  * @pgoff:              offset from @kaddr to start at
    3548             :  * @size:               size of map area
    3549             :  *
    3550             :  * Returns:     0 for success, -Exxx on failure
    3551             :  *
    3552             :  * This function checks that @kaddr is a valid vmalloc'ed area,
    3553             :  * and that it is big enough to cover the range starting at
    3554             :  * @uaddr in @vma. Will return failure if that criteria isn't
    3555             :  * met.
    3556             :  *
    3557             :  * Similar to remap_pfn_range() (see mm/memory.c)
    3558             :  */
    3559           0 : int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
    3560             :                                 void *kaddr, unsigned long pgoff,
    3561             :                                 unsigned long size)
    3562             : {
    3563             :         struct vm_struct *area;
    3564             :         unsigned long off;
    3565             :         unsigned long end_index;
    3566             : 
    3567           0 :         if (check_shl_overflow(pgoff, PAGE_SHIFT, &off))
    3568             :                 return -EINVAL;
    3569             : 
    3570           0 :         size = PAGE_ALIGN(size);
    3571             : 
    3572           0 :         if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr))
    3573             :                 return -EINVAL;
    3574             : 
    3575           0 :         area = find_vm_area(kaddr);
    3576           0 :         if (!area)
    3577             :                 return -EINVAL;
    3578             : 
    3579           0 :         if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT)))
    3580             :                 return -EINVAL;
    3581             : 
    3582           0 :         if (check_add_overflow(size, off, &end_index) ||
    3583             :             end_index > get_vm_area_size(area))
    3584             :                 return -EINVAL;
    3585           0 :         kaddr += off;
    3586             : 
    3587             :         do {
    3588           0 :                 struct page *page = vmalloc_to_page(kaddr);
    3589             :                 int ret;
    3590             : 
    3591           0 :                 ret = vm_insert_page(vma, uaddr, page);
    3592           0 :                 if (ret)
    3593             :                         return ret;
    3594             : 
    3595           0 :                 uaddr += PAGE_SIZE;
    3596           0 :                 kaddr += PAGE_SIZE;
    3597           0 :                 size -= PAGE_SIZE;
    3598           0 :         } while (size > 0);
    3599             : 
    3600           0 :         vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
    3601             : 
    3602           0 :         return 0;
    3603             : }
    3604             : 
    3605             : /**
    3606             :  * remap_vmalloc_range - map vmalloc pages to userspace
    3607             :  * @vma:                vma to cover (map full range of vma)
    3608             :  * @addr:               vmalloc memory
    3609             :  * @pgoff:              number of pages into addr before first page to map
    3610             :  *
    3611             :  * Returns:     0 for success, -Exxx on failure
    3612             :  *
    3613             :  * This function checks that addr is a valid vmalloc'ed area, and
    3614             :  * that it is big enough to cover the vma. Will return failure if
    3615             :  * that criteria isn't met.
    3616             :  *
    3617             :  * Similar to remap_pfn_range() (see mm/memory.c)
    3618             :  */
    3619           0 : int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
    3620             :                                                 unsigned long pgoff)
    3621             : {
    3622           0 :         return remap_vmalloc_range_partial(vma, vma->vm_start,
    3623             :                                            addr, pgoff,
    3624           0 :                                            vma->vm_end - vma->vm_start);
    3625             : }
    3626             : EXPORT_SYMBOL(remap_vmalloc_range);
    3627             : 
    3628           0 : void free_vm_area(struct vm_struct *area)
    3629             : {
    3630             :         struct vm_struct *ret;
    3631           0 :         ret = remove_vm_area(area->addr);
    3632           0 :         BUG_ON(ret != area);
    3633           0 :         kfree(area);
    3634           0 : }
    3635             : EXPORT_SYMBOL_GPL(free_vm_area);
    3636             : 
    3637             : #ifdef CONFIG_SMP
    3638             : static struct vmap_area *node_to_va(struct rb_node *n)
    3639             : {
    3640             :         return rb_entry_safe(n, struct vmap_area, rb_node);
    3641             : }
    3642             : 
    3643             : /**
    3644             :  * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to
    3645             :  * @addr: target address
    3646             :  *
    3647             :  * Returns: vmap_area if it is found. If there is no such area
    3648             :  *   the first highest(reverse order) vmap_area is returned
    3649             :  *   i.e. va->va_start < addr && va->va_end < addr or NULL
    3650             :  *   if there are no any areas before @addr.
    3651             :  */
    3652             : static struct vmap_area *
    3653             : pvm_find_va_enclose_addr(unsigned long addr)
    3654             : {
    3655             :         struct vmap_area *va, *tmp;
    3656             :         struct rb_node *n;
    3657             : 
    3658             :         n = free_vmap_area_root.rb_node;
    3659             :         va = NULL;
    3660             : 
    3661             :         while (n) {
    3662             :                 tmp = rb_entry(n, struct vmap_area, rb_node);
    3663             :                 if (tmp->va_start <= addr) {
    3664             :                         va = tmp;
    3665             :                         if (tmp->va_end >= addr)
    3666             :                                 break;
    3667             : 
    3668             :                         n = n->rb_right;
    3669             :                 } else {
    3670             :                         n = n->rb_left;
    3671             :                 }
    3672             :         }
    3673             : 
    3674             :         return va;
    3675             : }
    3676             : 
    3677             : /**
    3678             :  * pvm_determine_end_from_reverse - find the highest aligned address
    3679             :  * of free block below VMALLOC_END
    3680             :  * @va:
    3681             :  *   in - the VA we start the search(reverse order);
    3682             :  *   out - the VA with the highest aligned end address.
    3683             :  * @align: alignment for required highest address
    3684             :  *
    3685             :  * Returns: determined end address within vmap_area
    3686             :  */
    3687             : static unsigned long
    3688             : pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align)
    3689             : {
    3690             :         unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
    3691             :         unsigned long addr;
    3692             : 
    3693             :         if (likely(*va)) {
    3694             :                 list_for_each_entry_from_reverse((*va),
    3695             :                                 &free_vmap_area_list, list) {
    3696             :                         addr = min((*va)->va_end & ~(align - 1), vmalloc_end);
    3697             :                         if ((*va)->va_start < addr)
    3698             :                                 return addr;
    3699             :                 }
    3700             :         }
    3701             : 
    3702             :         return 0;
    3703             : }
    3704             : 
/**
 * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
 * @offsets: array containing offset of each area
 * @sizes: array containing size of each area
 * @nr_vms: the number of areas to allocate
 * @align: alignment, all entries in @offsets and @sizes must be aligned to this
 *
 * Returns: kmalloc'd vm_struct pointer array pointing to allocated
 *          vm_structs on success, %NULL on failure
 *
 * Percpu allocator wants to use congruent vm areas so that it can
 * maintain the offsets among percpu areas.  This function allocates
 * congruent vmalloc areas for it with GFP_KERNEL.  These areas tend to
 * be scattered pretty far, distance between two areas easily going up
 * to gigabytes.  To avoid interacting with regular vmallocs, these
 * areas are allocated from top.
 *
 * Despite its complicated look, this allocator is rather simple. It
 * does everything top-down and scans free blocks from the end looking
 * for matching base. While scanning, if any of the areas do not fit the
 * base address is pulled down to fit the area. Scanning is repeated till
 * all the areas fit and then all necessary data structures are inserted
 * and the result is returned.
 *
 * If no base is found, or carving an area out of the free space fails,
 * purge_vmap_area_lazy() is called and the whole search is retried
 * exactly once before giving up.
 */
struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
				     const size_t *sizes, int nr_vms,
				     size_t align)
{
	const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
	const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
	struct vmap_area **vas, *va;
	struct vm_struct **vms;
	int area, area2, last_area, term_area;
	unsigned long base, start, size, end, last_end, orig_start, orig_end;
	bool purged = false;
	enum fit_type type;

	/* verify parameters and allocate data structures */
	BUG_ON(offset_in_page(align) || !is_power_of_2(align));
	for (last_area = 0, area = 0; area < nr_vms; area++) {
		start = offsets[area];
		end = start + sizes[area];

		/* is everything aligned properly? */
		BUG_ON(!IS_ALIGNED(offsets[area], align));
		BUG_ON(!IS_ALIGNED(sizes[area], align));

		/* detect the area with the highest address */
		if (start > offsets[last_area])
			last_area = area;

		/* no two requested areas may overlap each other */
		for (area2 = area + 1; area2 < nr_vms; area2++) {
			unsigned long start2 = offsets[area2];
			unsigned long end2 = start2 + sizes[area2];

			BUG_ON(start2 < end && start < end2);
		}
	}
	/* end offset of the highest area, i.e. the total span required */
	last_end = offsets[last_area] + sizes[last_area];

	/* the aligned vmalloc range is smaller than the required span */
	if (vmalloc_end - vmalloc_start < last_end) {
		WARN_ON(true);
		return NULL;
	}

	/* vms[] is what gets returned; vas[] is scratch, freed before return */
	vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
	vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
	if (!vas || !vms)
		goto err_free2;

	/* allocate every descriptor up front, before any lock is taken */
	for (area = 0; area < nr_vms; area++) {
		vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL);
		vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
		if (!vas[area] || !vms[area])
			goto err_free;
	}
retry:
	spin_lock(&free_vmap_area_lock);

	/* start scanning - we scan from the top, begin with the last area */
	area = term_area = last_area;
	start = offsets[area];
	end = start + sizes[area];

	va = pvm_find_va_enclose_addr(vmalloc_end);
	base = pvm_determine_end_from_reverse(&va, align) - end;

	while (true) {
		/*
		 * base might have underflowed, add last_end before
		 * comparing.
		 */
		if (base + last_end < vmalloc_start + last_end)
			goto overflow;

		/*
		 * Fitting base has not been found.
		 */
		if (va == NULL)
			goto overflow;

		/*
		 * If required width exceeds current VA block, move
		 * base downwards and then recheck.
		 */
		if (base + end > va->va_end) {
			base = pvm_determine_end_from_reverse(&va, align) - end;
			term_area = area;
			continue;
		}

		/*
		 * If this VA does not fit, move base downwards and recheck.
		 */
		if (base + start < va->va_start) {
			va = node_to_va(rb_prev(&va->rb_node));
			base = pvm_determine_end_from_reverse(&va, align) - end;
			term_area = area;
			continue;
		}

		/*
		 * This area fits, move on to the previous one.  If
		 * the previous one is the terminal one, we're done.
		 */
		/* step to the previous index, wrapping from 0 to nr_vms - 1 */
		area = (area + nr_vms - 1) % nr_vms;
		if (area == term_area)
			break;

		start = offsets[area];
		end = start + sizes[area];
		va = pvm_find_va_enclose_addr(base + end);
	}

	/* we've found a fitting base, insert all va's */
	for (area = 0; area < nr_vms; area++) {
		int ret;

		start = base + offsets[area];
		size = sizes[area];

		va = pvm_find_va_enclose_addr(start);
		if (WARN_ON_ONCE(va == NULL))
			/* It is a BUG(), but trigger recovery instead. */
			goto recovery;

		type = classify_va_fit_type(va, start, size);
		if (WARN_ON_ONCE(type == NOTHING_FIT))
			/* It is a BUG(), but trigger recovery instead. */
			goto recovery;

		ret = adjust_va_to_fit_type(va, start, size, type);
		if (unlikely(ret))
			goto recovery;

		/* Allocated area. */
		va = vas[area];
		va->va_start = start;
		va->va_end = start + size;
	}

	spin_unlock(&free_vmap_area_lock);

	/* populate the kasan shadow space */
	for (area = 0; area < nr_vms; area++) {
		if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area]))
			goto err_free_shadow;
	}

	/* insert all vm's */
	spin_lock(&vmap_area_lock);
	for (area = 0; area < nr_vms; area++) {
		insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);

		setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
				 pcpu_get_vm_areas);
	}
	spin_unlock(&vmap_area_lock);

	/*
	 * Mark allocated areas as accessible. Do it now as a best-effort
	 * approach, as they can be mapped outside of vmalloc code.
	 * With hardware tag-based KASAN, marking is skipped for
	 * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
	 */
	for (area = 0; area < nr_vms; area++)
		vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr,
				vms[area]->size, KASAN_VMALLOC_PROT_NORMAL);

	/* only the pointer array is freed; the vmap_areas stay in use */
	kfree(vas);
	return vms;

recovery:
	/*
	 * Remove previously allocated areas. There is no
	 * need in removing these areas from the busy tree,
	 * because they are inserted only on the final step
	 * and when pcpu_get_vm_areas() is success.
	 */
	/* 'area' is the index that failed; walk back over the ones before it */
	while (area--) {
		orig_start = vas[area]->va_start;
		orig_end = vas[area]->va_end;
		va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
				&free_vmap_area_list);
		if (va)
			kasan_release_vmalloc(orig_start, orig_end,
				va->va_start, va->va_end);
		/* handed back to the free tree; must not be freed in err_free */
		vas[area] = NULL;
	}

overflow:
	spin_unlock(&free_vmap_area_lock);
	/* 'purged' limits the purge-and-retry to a single attempt */
	if (!purged) {
		purge_vmap_area_lazy();
		purged = true;

		/* Before "retry", check if we recover. */
		for (area = 0; area < nr_vms; area++) {
			/* only slots cleared by the recovery path need refilling */
			if (vas[area])
				continue;

			vas[area] = kmem_cache_zalloc(
				vmap_area_cachep, GFP_KERNEL);
			if (!vas[area])
				goto err_free;
		}

		goto retry;
	}

	/* free the per-area descriptors, then fall through to the arrays */
err_free:
	for (area = 0; area < nr_vms; area++) {
		if (vas[area])
			kmem_cache_free(vmap_area_cachep, vas[area]);

		kfree(vms[area]);
	}
err_free2:
	kfree(vas);
	kfree(vms);
	return NULL;

err_free_shadow:
	spin_lock(&free_vmap_area_lock);
	/*
	 * We release all the vmalloc shadows, even the ones for regions that
	 * hadn't been successfully added. This relies on kasan_release_vmalloc
	 * being able to tolerate this case.
	 */
	for (area = 0; area < nr_vms; area++) {
		orig_start = vas[area]->va_start;
		orig_end = vas[area]->va_end;
		va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
				&free_vmap_area_list);
		if (va)
			kasan_release_vmalloc(orig_start, orig_end,
				va->va_start, va->va_end);
		vas[area] = NULL;
		kfree(vms[area]);
	}
	spin_unlock(&free_vmap_area_lock);
	kfree(vas);
	kfree(vms);
	return NULL;
}
    3970             : 
    3971             : /**
    3972             :  * pcpu_free_vm_areas - free vmalloc areas for percpu allocator
    3973             :  * @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
    3974             :  * @nr_vms: the number of allocated areas
    3975             :  *
    3976             :  * Free vm_structs and the array allocated by pcpu_get_vm_areas().
                        *
                        * Each of the first @nr_vms entries of @vms is passed to free_vm_area(),
                        * after which the @vms array itself is released with kfree(). The caller
                        * must therefore not use @vms once this function has returned.
    3977             :  */
    3978             : void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
    3979             : {
    3980             :         int i;
    3981             : 
    3982             :         for (i = 0; i < nr_vms; i++)
    3983             :                 free_vm_area(vms[i]);
    3984             :         kfree(vms);
    3985             : }
    3986             : #endif  /* CONFIG_SMP */
    3987             : 
    3988             : #ifdef CONFIG_PRINTK
    /*
     * vmalloc_dump_obj - describe the vmalloc area associated with an address
     * @object: address to look up
     *
     * Page-aligns @object with PAGE_ALIGN() and looks the result up with
     * find_vm_area(). On a hit, the area's page count, start address and
     * caller are emitted through pr_cont().
     *
     * NOTE(review): PAGE_ALIGN() rounds up, so an address that is not already
     * page aligned is looked up via the following page boundary - confirm
     * that this is the intended lookup address.
     *
     * Return: true if an area was found and printed, false otherwise.
     */
    3989           0 : bool vmalloc_dump_obj(void *object)
    3990             : {
    3991             :         struct vm_struct *vm;
    3992           0 :         void *objp = (void *)PAGE_ALIGN((unsigned long)object);
    3993             : 
    3994           0 :         vm = find_vm_area(objp);
    3995           0 :         if (!vm)
    3996             :                 return false;
    3997           0 :         pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n",
    3998             :                 vm->nr_pages, (unsigned long)vm->addr, vm->caller);
    3999           0 :         return true;
    4000             : }
    4001             : #endif
    4002             : 
    4003             : #ifdef CONFIG_PROC_FS
    /*
     * s_start - seq_file ->start callback of vmalloc_op
     * @m: seq_file being read (unused here)
     * @pos: position to start iterating from
     *
     * Takes vmap_purge_lock (mutex) first and vmap_area_lock (spinlock)
     * second, then returns the vmap_area_list entry at *@pos as found by
     * seq_list_start(). Both locks are still held on return; s_stop()
     * drops them in the reverse order.
     */
    4004           0 : static void *s_start(struct seq_file *m, loff_t *pos)
    4005             :         __acquires(&vmap_purge_lock)
    4006             :         __acquires(&vmap_area_lock)
    4007             : {
    4008           0 :         mutex_lock(&vmap_purge_lock);
    4009           0 :         spin_lock(&vmap_area_lock);
    4010             : 
    4011           0 :         return seq_list_start(&vmap_area_list, *pos);
    4012             : }
    4013             : 
    /*
     * s_next - seq_file ->next callback of vmalloc_op
     * @m: seq_file being read (unused here)
     * @p: current list position, as returned by s_start() or a previous s_next()
     * @pos: iteration position, passed on to seq_list_next()
     *
     * Returns whatever seq_list_next() yields for @p on vmap_area_list.
     */
    4014           0 : static void *s_next(struct seq_file *m, void *p, loff_t *pos)
    4015             : {
    4016           0 :         return seq_list_next(p, &vmap_area_list, pos);
    4017             : }
    4018             : 
    /*
     * s_stop - seq_file ->stop callback of vmalloc_op
     * @m: seq_file being read (unused here)
     * @p: last iteration position (unused here)
     *
     * Releases the locks taken in s_start(), in the opposite order:
     * vmap_area_lock first, then vmap_purge_lock.
     */
    4019           0 : static void s_stop(struct seq_file *m, void *p)
    4020             :         __releases(&vmap_area_lock)
    4021             :         __releases(&vmap_purge_lock)
    4022             : {
    4023           0 :         spin_unlock(&vmap_area_lock);
    4024           0 :         mutex_unlock(&vmap_purge_lock);
    4025           0 : }
    4026             : 
    /*
     * show_numa_info - append per-node page counts for one vm_struct
     * @m: seq_file to print to; m->private is used as the counter array
     * @v: area whose pages are counted
     *
     * Does nothing unless CONFIG_NUMA is enabled. Also returns without
     * printing when m->private is NULL or when @v still has VM_UNINITIALIZED
     * set.
     *
     * The counter array is cleared for nr_node_ids entries on every call.
     * v->pages is then walked in strides of (1 << vm_area_page_order(v));
     * each visited page credits a full stride of pages to the node reported
     * by page_to_nid(). Finally " N%u=%u" (node, count) is printed for every
     * node in the N_HIGH_MEMORY state whose counter is non-zero.
     */
    4027             : static void show_numa_info(struct seq_file *m, struct vm_struct *v)
    4028             : {
    4029             :         if (IS_ENABLED(CONFIG_NUMA)) {
    4030             :                 unsigned int nr, *counters = m->private;
    4031             :                 unsigned int step = 1U << vm_area_page_order(v);
    4032             : 
    4033             :                 if (!counters)
    4034             :                         return;
    4035             : 
    4036             :                 if (v->flags & VM_UNINITIALIZED)
    4037             :                         return;
    4038             :                 /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
    4039             :                 smp_rmb();
    4040             : 
    4041             :                 memset(counters, 0, nr_node_ids * sizeof(unsigned int));
    4042             : 
    4043             :                 for (nr = 0; nr < v->nr_pages; nr += step)
    4044             :                         counters[page_to_nid(v->pages[nr])] += step;
    4045             :                 for_each_node_state(nr, N_HIGH_MEMORY)
    4046             :                         if (counters[nr])
    4047             :                                 seq_printf(m, " N%u=%u", nr, counters[nr]);
    4048             :         }
    4049             : }
    4050             : 
    /*
     * show_purge_info - list the areas currently on purge_vmap_area_list
     * @m: seq_file to print to
     *
     * With purge_vmap_area_lock held, prints one line per entry of
     * purge_vmap_area_list: start address, end address, the size
     * (va_end - va_start) and the tag "unpurged vm_area".
     */
    4051           0 : static void show_purge_info(struct seq_file *m)
    4052             : {
    4053             :         struct vmap_area *va;
    4054             : 
    4055           0 :         spin_lock(&purge_vmap_area_lock);
    4056           0 :         list_for_each_entry(va, &purge_vmap_area_list, list) {
    4057           0 :                 seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
    4058             :                         (void *)va->va_start, (void *)va->va_end,
    4059           0 :                         va->va_end - va->va_start);
    4060             :         }
    4061           0 :         spin_unlock(&purge_vmap_area_lock);
    4062           0 : }
    4063             : 
    /*
     * s_show - seq_file ->show callback of vmalloc_op
     * @m: seq_file to print to
     * @p: list node of the vmap_area to describe
     *
     * Prints one line for the vmap_area that embeds @p:
     *  - if va->vm is NULL, the va_start-va_end range, its size and the tag
     *    "vm_map_ram";
     *  - otherwise the vm_struct's address range and size, followed by the
     *    optional fields caller, "pages=", "phys=", one keyword per set flag
     *    (ioremap, vmalloc, vmap, user, dma-coherent), "vpages" when
     *    is_vmalloc_addr() is true for the v->pages pointer, and the output
     *    of show_numa_info().
     *
     * When @p is the last entry of vmap_area_list, show_purge_info() is
     * called afterwards to append the not yet purged areas.
     *
     * Return: always 0.
     */
    4064           0 : static int s_show(struct seq_file *m, void *p)
    4065             : {
    4066             :         struct vmap_area *va;
    4067             :         struct vm_struct *v;
    4068             : 
    4069           0 :         va = list_entry(p, struct vmap_area, list);
    4070             : 
    4071             :         /*
    4072             :          * s_show can race with remove_vm_area(): va->vm is NULL either because
    4073             :          * the vmap area is being torn down or because it is a vm_map_ram allocation.
    4074             :          */
    4075           0 :         if (!va->vm) {
    4076           0 :                 seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
    4077             :                         (void *)va->va_start, (void *)va->va_end,
    4078           0 :                         va->va_end - va->va_start);
    4079             : 
    4080           0 :                 goto final;
    4081             :         }
    4082             : 
    4083           0 :         v = va->vm;
    4084             : 
    4085           0 :         seq_printf(m, "0x%pK-0x%pK %7ld",
    4086           0 :                 v->addr, v->addr + v->size, v->size);
    4087             : 
    4088           0 :         if (v->caller)
    4089           0 :                 seq_printf(m, " %pS", v->caller);
    4090             : 
    4091           0 :         if (v->nr_pages)
    4092           0 :                 seq_printf(m, " pages=%d", v->nr_pages);
    4093             : 
    4094           0 :         if (v->phys_addr)
    4095           0 :                 seq_printf(m, " phys=%pa", &v->phys_addr);
    4096             : 
    4097           0 :         if (v->flags & VM_IOREMAP)
    4098           0 :                 seq_puts(m, " ioremap");
    4099             : 
    4100           0 :         if (v->flags & VM_ALLOC)
    4101           0 :                 seq_puts(m, " vmalloc");
    4102             : 
    4103           0 :         if (v->flags & VM_MAP)
    4104           0 :                 seq_puts(m, " vmap");
    4105             : 
    4106           0 :         if (v->flags & VM_USERMAP)
    4107           0 :                 seq_puts(m, " user");
    4108             : 
    4109           0 :         if (v->flags & VM_DMA_COHERENT)
    4110           0 :                 seq_puts(m, " dma-coherent");
    4111             : 
    4112           0 :         if (is_vmalloc_addr(v->pages))
    4113           0 :                 seq_puts(m, " vpages");
    4114             : 
    4115           0 :         show_numa_info(m, v);
    4116           0 :         seq_putc(m, '\n');
    4117             : 
    4118             :         /*
    4119             :          * As a final step, dump "unpurged" areas.
    4120             :          */
    4121             : final:
    4122           0 :         if (list_is_last(&va->list, &vmap_area_list))
    4123           0 :                 show_purge_info(m);
    4124             : 
    4125           0 :         return 0;
    4126             : }
    4127             : 
    /*
     * seq_file iterator callbacks for the "vmallocinfo" proc entry that
     * proc_vmalloc_init() registers: s_start()/s_stop() bracket the walk,
     * s_next() advances it and s_show() prints one entry.
     */
    4128             : static const struct seq_operations vmalloc_op = {
    4129             :         .start = s_start,
    4130             :         .next = s_next,
    4131             :         .stop = s_stop,
    4132             :         .show = s_show,
    4133             : };
    4134             : 
    /*
     * proc_vmalloc_init - register the "vmallocinfo" proc entry
     *
     * Creates "vmallocinfo" with mode 0400, backed by vmalloc_op. With
     * CONFIG_NUMA enabled the entry is created through
     * proc_create_seq_private() with a private area of
     * nr_node_ids * sizeof(unsigned int) bytes, which is the size
     * show_numa_info() clears and uses as its per-node counter array;
     * otherwise plain proc_create_seq() is used.
     *
     * The results of the proc_create_seq*() calls are not checked.
     *
     * Return: always 0.
     */
    4135           1 : static int __init proc_vmalloc_init(void)
    4136             : {
    4137             :         if (IS_ENABLED(CONFIG_NUMA))
    4138             :                 proc_create_seq_private("vmallocinfo", 0400, NULL,
    4139             :                                 &vmalloc_op,
    4140             :                                 nr_node_ids * sizeof(unsigned int), NULL);
    4141             :         else
    4142           1 :                 proc_create_seq("vmallocinfo", 0400, NULL, &vmalloc_op);
    4143           1 :         return 0;
    4144             : }
    4145             : module_init(proc_vmalloc_init);
    4146             : 
    4147             : #endif

Generated by: LCOV version 1.14