Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-only
2 : /*
3 : * linux/mm/page_alloc.c
4 : *
5 : * Manages the free list; the system allocates free pages here.
6 : * Note that kmalloc() lives in slab.c
7 : *
8 : * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
9 : * Swap reorganised 29.12.95, Stephen Tweedie
10 : * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
11 : * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
12 : * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
13 : * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
14 : * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
15 : * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
16 : */
17 :
18 : #include <linux/stddef.h>
19 : #include <linux/mm.h>
20 : #include <linux/highmem.h>
21 : #include <linux/swap.h>
22 : #include <linux/swapops.h>
23 : #include <linux/interrupt.h>
24 : #include <linux/pagemap.h>
25 : #include <linux/jiffies.h>
26 : #include <linux/memblock.h>
27 : #include <linux/compiler.h>
28 : #include <linux/kernel.h>
29 : #include <linux/kasan.h>
30 : #include <linux/module.h>
31 : #include <linux/suspend.h>
32 : #include <linux/pagevec.h>
33 : #include <linux/blkdev.h>
34 : #include <linux/slab.h>
35 : #include <linux/ratelimit.h>
36 : #include <linux/oom.h>
37 : #include <linux/topology.h>
38 : #include <linux/sysctl.h>
39 : #include <linux/cpu.h>
40 : #include <linux/cpuset.h>
41 : #include <linux/memory_hotplug.h>
42 : #include <linux/nodemask.h>
43 : #include <linux/vmalloc.h>
44 : #include <linux/vmstat.h>
45 : #include <linux/mempolicy.h>
46 : #include <linux/memremap.h>
47 : #include <linux/stop_machine.h>
48 : #include <linux/random.h>
49 : #include <linux/sort.h>
50 : #include <linux/pfn.h>
51 : #include <linux/backing-dev.h>
52 : #include <linux/fault-inject.h>
53 : #include <linux/page-isolation.h>
54 : #include <linux/debugobjects.h>
55 : #include <linux/kmemleak.h>
56 : #include <linux/compaction.h>
57 : #include <trace/events/kmem.h>
58 : #include <trace/events/oom.h>
59 : #include <linux/prefetch.h>
60 : #include <linux/mm_inline.h>
61 : #include <linux/mmu_notifier.h>
62 : #include <linux/migrate.h>
63 : #include <linux/hugetlb.h>
64 : #include <linux/sched/rt.h>
65 : #include <linux/sched/mm.h>
66 : #include <linux/page_owner.h>
67 : #include <linux/page_table_check.h>
68 : #include <linux/kthread.h>
69 : #include <linux/memcontrol.h>
70 : #include <linux/ftrace.h>
71 : #include <linux/lockdep.h>
72 : #include <linux/nmi.h>
73 : #include <linux/psi.h>
74 : #include <linux/padata.h>
75 : #include <linux/khugepaged.h>
76 : #include <linux/buffer_head.h>
77 : #include <linux/delayacct.h>
78 : #include <asm/sections.h>
79 : #include <asm/tlbflush.h>
80 : #include <asm/div64.h>
81 : #include "internal.h"
82 : #include "shuffle.h"
83 : #include "page_reporting.h"
84 :
85 : /* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
86 : typedef int __bitwise fpi_t;
87 :
88 : /* No special request */
89 : #define FPI_NONE ((__force fpi_t)0)
90 :
91 : /*
92 : * Skip free page reporting notification for the (possibly merged) page.
93 : * This does not hinder free page reporting from grabbing the page,
94 : * reporting it and marking it "reported" - it only skips notifying
95 : * the free page reporting infrastructure about a newly freed page. For
96 : * example, used when temporarily pulling a page from a freelist and
97 : * putting it back unmodified.
98 : */
99 : #define FPI_SKIP_REPORT_NOTIFY ((__force fpi_t)BIT(0))
100 :
101 : /*
102 : * Place the (possibly merged) page to the tail of the freelist. Will ignore
103 : * page shuffling (relevant code - e.g., memory onlining - is expected to
104 : * shuffle the whole zone).
105 : *
106 : * Note: No code should rely on this flag for correctness - it's purely
107 : * to allow for optimizations when handing back either fresh pages
108 : * (memory onlining) or untouched pages (page isolation, free page
109 : * reporting).
110 : */
111 : #define FPI_TO_TAIL ((__force fpi_t)BIT(1))
112 :
113 : /*
114 : * Don't poison memory with KASAN (only for the tag-based modes).
115 : * During boot, all non-reserved memblock memory is exposed to page_alloc.
116 : * Poisoning all that memory lengthens boot time, especially on systems with
117 : * a large amount of RAM. This flag is used to skip that poisoning.
118 : * This is only done for the tag-based KASAN modes, as those are able to
119 : * detect memory corruptions with the memory tags assigned by default.
120 : * All memory allocated normally after boot gets poisoned as usual.
121 : */
122 : #define FPI_SKIP_KASAN_POISON ((__force fpi_t)BIT(2))
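A minimal sketch of how these flags are meant to be combined and passed around; only fpi_t, the FPI_* flags and __free_pages_ok() come from this file, the wrapper name is hypothetical. Later in this file __free_pages_core() passes FPI_TO_TAIL | FPI_SKIP_KASAN_POISON in exactly this way.

/* Hypothetical wrapper: return a briefly borrowed page straight to the buddy lists. */
static void return_page_untouched(struct page *page, unsigned int order)
{
	/* Keep the page at the freelist tail and skip the reporting notification. */
	fpi_t fpi_flags = FPI_TO_TAIL | FPI_SKIP_REPORT_NOTIFY;

	__free_pages_ok(page, order, fpi_flags);
}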
123 :
124 : /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
125 : static DEFINE_MUTEX(pcp_batch_high_lock);
126 : #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
127 :
128 : struct pagesets {
129 : local_lock_t lock;
130 : };
131 : static DEFINE_PER_CPU(struct pagesets, pagesets) = {
132 : .lock = INIT_LOCAL_LOCK(lock),
133 : };
134 :
135 : #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
136 : DEFINE_PER_CPU(int, numa_node);
137 : EXPORT_PER_CPU_SYMBOL(numa_node);
138 : #endif
139 :
140 : DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key);
141 :
142 : #ifdef CONFIG_HAVE_MEMORYLESS_NODES
143 : /*
144 : * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
145 : * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
146 : * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
147 : * defined in <linux/topology.h>.
148 : */
149 : DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
150 : EXPORT_PER_CPU_SYMBOL(_numa_mem_);
151 : #endif
152 :
153 : /* work_structs for global per-cpu drains */
154 : struct pcpu_drain {
155 : struct zone *zone;
156 : struct work_struct work;
157 : };
158 : static DEFINE_MUTEX(pcpu_drain_mutex);
159 : static DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain);
160 :
161 : #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
162 : volatile unsigned long latent_entropy __latent_entropy;
163 : EXPORT_SYMBOL(latent_entropy);
164 : #endif
165 :
166 : /*
167 : * Array of node states.
168 : */
169 : nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
170 : [N_POSSIBLE] = NODE_MASK_ALL,
171 : [N_ONLINE] = { { [0] = 1UL } },
172 : #ifndef CONFIG_NUMA
173 : [N_NORMAL_MEMORY] = { { [0] = 1UL } },
174 : #ifdef CONFIG_HIGHMEM
175 : [N_HIGH_MEMORY] = { { [0] = 1UL } },
176 : #endif
177 : [N_MEMORY] = { { [0] = 1UL } },
178 : [N_CPU] = { { [0] = 1UL } },
179 : #endif /* NUMA */
180 : };
181 : EXPORT_SYMBOL(node_states);
182 :
183 : atomic_long_t _totalram_pages __read_mostly;
184 : EXPORT_SYMBOL(_totalram_pages);
185 : unsigned long totalreserve_pages __read_mostly;
186 : unsigned long totalcma_pages __read_mostly;
187 :
188 : int percpu_pagelist_high_fraction;
189 : gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
190 : DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
191 : EXPORT_SYMBOL(init_on_alloc);
192 :
193 : DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
194 : EXPORT_SYMBOL(init_on_free);
195 :
196 : static bool _init_on_alloc_enabled_early __read_mostly
197 : = IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
198 0 : static int __init early_init_on_alloc(char *buf)
199 : {
200 :
201 0 : return kstrtobool(buf, &_init_on_alloc_enabled_early);
202 : }
203 : early_param("init_on_alloc", early_init_on_alloc);
204 :
205 : static bool _init_on_free_enabled_early __read_mostly
206 : = IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON);
207 0 : static int __init early_init_on_free(char *buf)
208 : {
209 0 : return kstrtobool(buf, &_init_on_free_enabled_early);
210 : }
211 : early_param("init_on_free", early_init_on_free);
212 :
213 : /*
214 : * A cached value of the page's pageblock's migratetype, used when the page is
215 : * put on a pcplist. Used to avoid the pageblock migratetype lookup when
216 : * freeing from pcplists in most cases, at the cost of possibly becoming stale.
217 : * Also the migratetype set in the page does not necessarily match the pcplist
218 : * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
219 : * other index - this ensures that it will be put on the correct CMA freelist.
220 : */
221 : static inline int get_pcppage_migratetype(struct page *page)
222 : {
223 3 : return page->index;
224 : }
225 :
226 : static inline void set_pcppage_migratetype(struct page *page, int migratetype)
227 : {
228 694 : page->index = migratetype;
229 : }
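A short sketch of the round trip described in the comment above: cache the pageblock migratetype in page->index when a page enters a pcplist, and read the (possibly stale) cached value when flushing it back to the buddy lists. The two wrapper names are illustrative; get_pfnblock_migratetype() is defined further down in this file.

/* Illustrative only: cache the pageblock migratetype as a page enters a pcplist. */
static void pcplist_cache_example(struct page *page, unsigned long pfn)
{
	set_pcppage_migratetype(page, get_pfnblock_migratetype(page, pfn));
}

/* ...and reuse the cached value when freeing from the pcplist later. */
static int pcplist_flush_example(struct page *page)
{
	return get_pcppage_migratetype(page);
}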
230 :
231 : #ifdef CONFIG_PM_SLEEP
232 : /*
233 : * The following functions are used by the suspend/hibernate code to temporarily
234 : * change gfp_allowed_mask in order to avoid using I/O during memory allocations
235 : * while devices are suspended. To avoid races with the suspend/hibernate code,
236 : * they should always be called with system_transition_mutex held
237 : * (gfp_allowed_mask also should only be modified with system_transition_mutex
238 : * held, unless the suspend/hibernate code is guaranteed not to run in parallel
239 : * with that modification).
240 : */
241 :
242 : static gfp_t saved_gfp_mask;
243 :
244 0 : void pm_restore_gfp_mask(void)
245 : {
246 0 : WARN_ON(!mutex_is_locked(&system_transition_mutex));
247 0 : if (saved_gfp_mask) {
248 0 : gfp_allowed_mask = saved_gfp_mask;
249 0 : saved_gfp_mask = 0;
250 : }
251 0 : }
252 :
253 0 : void pm_restrict_gfp_mask(void)
254 : {
255 0 : WARN_ON(!mutex_is_locked(&system_transition_mutex));
256 0 : WARN_ON(saved_gfp_mask);
257 0 : saved_gfp_mask = gfp_allowed_mask;
258 0 : gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
259 0 : }
260 :
261 0 : bool pm_suspended_storage(void)
262 : {
263 0 : if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
264 : return false;
265 0 : return true;
266 : }
267 : #endif /* CONFIG_PM_SLEEP */
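A hedged sketch of the calling pattern the comment above requires: the suspend/hibernate core holds system_transition_mutex, restricts the GFP mask before devices are suspended and restores it on resume. The overall sequence and the do_suspend_devices() name are illustrative, not the actual hibernation code path.

static int suspend_sequence_sketch(void)
{
	int error;

	mutex_lock(&system_transition_mutex);
	pm_restrict_gfp_mask();		/* masks out __GFP_IO | __GFP_FS */
	error = do_suspend_devices();	/* hypothetical: no I/O-backed allocations from here on */
	pm_restore_gfp_mask();		/* undo, still under the mutex */
	mutex_unlock(&system_transition_mutex);

	return error;
}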
268 :
269 : #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
270 : unsigned int pageblock_order __read_mostly;
271 : #endif
272 :
273 : static void __free_pages_ok(struct page *page, unsigned int order,
274 : fpi_t fpi_flags);
275 :
276 : /*
277 : * results with 256, 32 in the lowmem_reserve sysctl:
278 : * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
279 : * 1G machine -> (16M dma, 784M normal, 224M high)
280 : * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
281 : * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
282 : * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
283 : *
284 : * TBD: should special case ZONE_DMA32 machines here - in those we normally
285 : * don't need any ZONE_NORMAL reservation
286 : */
287 : int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
288 : #ifdef CONFIG_ZONE_DMA
289 : [ZONE_DMA] = 256,
290 : #endif
291 : #ifdef CONFIG_ZONE_DMA32
292 : [ZONE_DMA32] = 256,
293 : #endif
294 : [ZONE_NORMAL] = 32,
295 : #ifdef CONFIG_HIGHMEM
296 : [ZONE_HIGHMEM] = 0,
297 : #endif
298 : [ZONE_MOVABLE] = 0,
299 : };
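A worked version of the 1G example from the comment above; the absolute sizes are the illustrative ones given there, assuming the default ratios 256/32.

/*
 * Worked example for the 1G layout above (16M DMA, 784M Normal, 224M HighMem):
 *
 *   NORMAL allocation:  784M / 256        ~= 3M   of ZONE_DMA held in reserve
 *   HIGHMEM allocation: 224M / 32          = 7M   of ZONE_NORMAL held in reserve
 *   HIGHMEM allocation: (224M + 784M) / 256 ~= 3.9M of ZONE_DMA held in reserve
 *
 * i.e. each lower zone holds back roughly managed_pages/ratio pages against
 * allocations that could have been satisfied from a higher zone instead.
 */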
300 :
301 : static char * const zone_names[MAX_NR_ZONES] = {
302 : #ifdef CONFIG_ZONE_DMA
303 : "DMA",
304 : #endif
305 : #ifdef CONFIG_ZONE_DMA32
306 : "DMA32",
307 : #endif
308 : "Normal",
309 : #ifdef CONFIG_HIGHMEM
310 : "HighMem",
311 : #endif
312 : "Movable",
313 : #ifdef CONFIG_ZONE_DEVICE
314 : "Device",
315 : #endif
316 : };
317 :
318 : const char * const migratetype_names[MIGRATE_TYPES] = {
319 : "Unmovable",
320 : "Movable",
321 : "Reclaimable",
322 : "HighAtomic",
323 : #ifdef CONFIG_CMA
324 : "CMA",
325 : #endif
326 : #ifdef CONFIG_MEMORY_ISOLATION
327 : "Isolate",
328 : #endif
329 : };
330 :
331 : compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
332 : [NULL_COMPOUND_DTOR] = NULL,
333 : [COMPOUND_PAGE_DTOR] = free_compound_page,
334 : #ifdef CONFIG_HUGETLB_PAGE
335 : [HUGETLB_PAGE_DTOR] = free_huge_page,
336 : #endif
337 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
338 : [TRANSHUGE_PAGE_DTOR] = free_transhuge_page,
339 : #endif
340 : };
341 :
342 : int min_free_kbytes = 1024;
343 : int user_min_free_kbytes = -1;
344 : int watermark_boost_factor __read_mostly = 15000;
345 : int watermark_scale_factor = 10;
346 :
347 : static unsigned long nr_kernel_pages __initdata;
348 : static unsigned long nr_all_pages __initdata;
349 : static unsigned long dma_reserve __initdata;
350 :
351 : static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
352 : static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
353 : static unsigned long required_kernelcore __initdata;
354 : static unsigned long required_kernelcore_percent __initdata;
355 : static unsigned long required_movablecore __initdata;
356 : static unsigned long required_movablecore_percent __initdata;
357 : static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
358 : static bool mirrored_kernelcore __meminitdata;
359 :
360 : /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
361 : int movable_zone;
362 : EXPORT_SYMBOL(movable_zone);
363 :
364 : #if MAX_NUMNODES > 1
365 : unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
366 : unsigned int nr_online_nodes __read_mostly = 1;
367 : EXPORT_SYMBOL(nr_node_ids);
368 : EXPORT_SYMBOL(nr_online_nodes);
369 : #endif
370 :
371 : int page_group_by_mobility_disabled __read_mostly;
372 :
373 : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
374 : /*
375 : * During boot we initialize deferred pages on-demand, as needed, but once
376 : * page_alloc_init_late() has finished, the deferred pages are all initialized,
377 : * and we can permanently disable that path.
378 : */
379 : static DEFINE_STATIC_KEY_TRUE(deferred_pages);
380 :
381 : static inline bool deferred_pages_enabled(void)
382 : {
383 : return static_branch_unlikely(&deferred_pages);
384 : }
385 :
386 : /* Returns true if the struct page for the pfn is uninitialised */
387 : static inline bool __meminit early_page_uninitialised(unsigned long pfn)
388 : {
389 : int nid = early_pfn_to_nid(pfn);
390 :
391 : if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
392 : return true;
393 :
394 : return false;
395 : }
396 :
397 : /*
398 : * Returns true when the remaining initialisation should be deferred until
399 : * later in the boot cycle when it can be parallelised.
400 : */
401 : static bool __meminit
402 : defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
403 : {
404 : static unsigned long prev_end_pfn, nr_initialised;
405 :
406 : /*
407 : * The static prev_end_pfn holds the end of the previous zone.
408 : * No locking is needed because this is called very early in boot, before smp_init().
409 : */
410 : if (prev_end_pfn != end_pfn) {
411 : prev_end_pfn = end_pfn;
412 : nr_initialised = 0;
413 : }
414 :
415 : /* Always populate low zones for address-constrained allocations */
416 : if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
417 : return false;
418 :
419 : if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX)
420 : return true;
421 : /*
422 : * We start with only one section of pages; more pages are added as
423 : * needed until the rest of the deferred pages are initialized.
424 : */
425 : nr_initialised++;
426 : if ((nr_initialised > PAGES_PER_SECTION) &&
427 : (pfn & (PAGES_PER_SECTION - 1)) == 0) {
428 : NODE_DATA(nid)->first_deferred_pfn = pfn;
429 : return true;
430 : }
431 : return false;
432 : }
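A worked note on the cut-off above, under the common x86-64 assumption of 4 KiB pages and 128 MiB sections (PAGES_PER_SECTION == 32768); the pfn values are made up.

/*
 * Worked example of the deferral threshold in defer_init():
 *
 *   node spans pfns [0x80000, 0x200000)           (2 GiB .. 8 GiB, illustrative)
 *   the first 32768 struct pages are initialised eagerly (128 MiB of memory)
 *   at pfn 0x88000, nr_initialised exceeds PAGES_PER_SECTION and the pfn is
 *   section-aligned, so first_deferred_pfn = 0x88000 and defer_init() returns
 *   true for every later pfn; those struct pages are filled in in parallel by
 *   page_alloc_init_late().
 */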
433 : #else
434 : static inline bool deferred_pages_enabled(void)
435 : {
436 : return false;
437 : }
438 :
439 : static inline bool early_page_uninitialised(unsigned long pfn)
440 : {
441 : return false;
442 : }
443 :
444 : static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
445 : {
446 : return false;
447 : }
448 : #endif
449 :
450 : /* Return a pointer to the bitmap storing bits affecting a block of pages */
451 : static inline unsigned long *get_pageblock_bitmap(const struct page *page,
452 : unsigned long pfn)
453 : {
454 : #ifdef CONFIG_SPARSEMEM
455 : return section_to_usemap(__pfn_to_section(pfn));
456 : #else
457 530 : return page_zone(page)->pageblock_flags;
458 : #endif /* CONFIG_SPARSEMEM */
459 : }
460 :
461 : static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn)
462 : {
463 : #ifdef CONFIG_SPARSEMEM
464 : pfn &= (PAGES_PER_SECTION-1);
465 : #else
466 530 : pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
467 : #endif /* CONFIG_SPARSEMEM */
468 530 : return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
469 : }
470 :
471 : static __always_inline
472 : unsigned long __get_pfnblock_flags_mask(const struct page *page,
473 : unsigned long pfn,
474 : unsigned long mask)
475 : {
476 : unsigned long *bitmap;
477 : unsigned long bitidx, word_bitidx;
478 : unsigned long word;
479 :
480 536 : bitmap = get_pageblock_bitmap(page, pfn);
481 268 : bitidx = pfn_to_bitidx(page, pfn);
482 268 : word_bitidx = bitidx / BITS_PER_LONG;
483 268 : bitidx &= (BITS_PER_LONG-1);
484 :
485 268 : word = bitmap[word_bitidx];
486 268 : return (word >> bitidx) & mask;
487 : }
488 :
489 : /**
490 : * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
491 : * @page: The page within the block of interest
492 : * @pfn: The target page frame number
493 : * @mask: mask of bits that the caller is interested in
494 : *
495 : * Return: pageblock_bits flags
496 : */
497 0 : unsigned long get_pfnblock_flags_mask(const struct page *page,
498 : unsigned long pfn, unsigned long mask)
499 : {
500 2 : return __get_pfnblock_flags_mask(page, pfn, mask);
501 : }
502 :
503 : static __always_inline int get_pfnblock_migratetype(const struct page *page,
504 : unsigned long pfn)
505 : {
506 266 : return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
507 : }
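A worked example of the bit-index arithmetic in pfn_to_bitidx() and __get_pfnblock_flags_mask() for the flat (!CONFIG_SPARSEMEM) case. pageblock_order == 9 and the zone_start_pfn value are assumptions for the sake of the numbers; NR_PAGEBLOCK_BITS == 4 is asserted by the BUILD_BUG_ON() below.

/*
 * Assuming pageblock_order == 9 (pageblock_nr_pages == 512), 64-bit longs and
 * zone_start_pfn == 0x1080:
 *
 *   pfn = 0x1a00
 *   round_down(0x1080, 512)      = 0x1000
 *   pfn - 0x1000                 = 0xa00          (2560)
 *   bitidx      = (2560 >> 9) * 4 = 5 * 4 = 20
 *   word_bitidx = 20 / BITS_PER_LONG = 0
 *   bitidx     &= 63             -> 20
 *
 * so the four pageblock bits for this block live at bits 20..23 of word 0
 * of the zone's pageblock_flags bitmap.
 */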
508 :
509 : /**
510 : * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
511 : * @page: The page within the block of interest
512 : * @flags: The flags to set
513 : * @pfn: The target page frame number
514 : * @mask: mask of bits that the caller is interested in
515 : */
516 262 : void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
517 : unsigned long pfn,
518 : unsigned long mask)
519 : {
520 : unsigned long *bitmap;
521 : unsigned long bitidx, word_bitidx;
522 : unsigned long old_word, word;
523 :
524 : BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
525 : BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
526 :
527 524 : bitmap = get_pageblock_bitmap(page, pfn);
528 262 : bitidx = pfn_to_bitidx(page, pfn);
529 262 : word_bitidx = bitidx / BITS_PER_LONG;
530 262 : bitidx &= (BITS_PER_LONG-1);
531 :
532 : VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
533 :
534 262 : mask <<= bitidx;
535 262 : flags <<= bitidx;
536 :
537 262 : word = READ_ONCE(bitmap[word_bitidx]);
538 : for (;;) {
539 524 : old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
540 262 : if (word == old_word)
541 : break;
542 : word = old_word;
543 : }
544 262 : }
545 :
546 262 : void set_pageblock_migratetype(struct page *page, int migratetype)
547 : {
548 262 : if (unlikely(page_group_by_mobility_disabled &&
549 : migratetype < MIGRATE_PCPTYPES))
550 0 : migratetype = MIGRATE_UNMOVABLE;
551 :
552 262 : set_pfnblock_flags_mask(page, (unsigned long)migratetype,
553 262 : page_to_pfn(page), MIGRATETYPE_MASK);
554 262 : }
555 :
556 : #ifdef CONFIG_DEBUG_VM
557 : static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
558 : {
559 : int ret = 0;
560 : unsigned seq;
561 : unsigned long pfn = page_to_pfn(page);
562 : unsigned long sp, start_pfn;
563 :
564 : do {
565 : seq = zone_span_seqbegin(zone);
566 : start_pfn = zone->zone_start_pfn;
567 : sp = zone->spanned_pages;
568 : if (!zone_spans_pfn(zone, pfn))
569 : ret = 1;
570 : } while (zone_span_seqretry(zone, seq));
571 :
572 : if (ret)
573 : pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
574 : pfn, zone_to_nid(zone), zone->name,
575 : start_pfn, start_pfn + sp);
576 :
577 : return ret;
578 : }
579 :
580 : static int page_is_consistent(struct zone *zone, struct page *page)
581 : {
582 : if (zone != page_zone(page))
583 : return 0;
584 :
585 : return 1;
586 : }
587 : /*
588 : * Temporary debugging check for pages not lying within a given zone.
589 : */
590 : static int __maybe_unused bad_range(struct zone *zone, struct page *page)
591 : {
592 : if (page_outside_zone_boundaries(zone, page))
593 : return 1;
594 : if (!page_is_consistent(zone, page))
595 : return 1;
596 :
597 : return 0;
598 : }
599 : #else
600 : static inline int __maybe_unused bad_range(struct zone *zone, struct page *page)
601 : {
602 : return 0;
603 : }
604 : #endif
605 :
606 0 : static void bad_page(struct page *page, const char *reason)
607 : {
608 : static unsigned long resume;
609 : static unsigned long nr_shown;
610 : static unsigned long nr_unshown;
611 :
612 : /*
613 : * Allow a burst of 60 reports, then keep quiet for that minute;
614 : * or allow a steady drip of one report per second.
615 : */
616 0 : if (nr_shown == 60) {
617 0 : if (time_before(jiffies, resume)) {
618 0 : nr_unshown++;
619 0 : goto out;
620 : }
621 0 : if (nr_unshown) {
622 0 : pr_alert(
623 : "BUG: Bad page state: %lu messages suppressed\n",
624 : nr_unshown);
625 0 : nr_unshown = 0;
626 : }
627 0 : nr_shown = 0;
628 : }
629 0 : if (nr_shown++ == 0)
630 0 : resume = jiffies + 60 * HZ;
631 :
632 0 : pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
633 : current->comm, page_to_pfn(page));
634 0 : dump_page(page, reason);
635 :
636 : print_modules();
637 0 : dump_stack();
638 : out:
639 : /* Leave bad fields for debug, except PageBuddy could make trouble */
640 0 : page_mapcount_reset(page); /* remove PageBuddy */
641 0 : add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
642 0 : }
643 :
644 : static inline unsigned int order_to_pindex(int migratetype, int order)
645 : {
646 478 : int base = order;
647 :
648 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
649 : if (order > PAGE_ALLOC_COSTLY_ORDER) {
650 : VM_BUG_ON(order != pageblock_order);
651 : base = PAGE_ALLOC_COSTLY_ORDER + 1;
652 : }
653 : #else
654 : VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
655 : #endif
656 :
657 478 : return (MIGRATE_PCPTYPES * base) + migratetype;
658 : }
659 :
660 : static inline int pindex_to_order(unsigned int pindex)
661 : {
662 0 : int order = pindex / MIGRATE_PCPTYPES;
663 :
664 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
665 : if (order > PAGE_ALLOC_COSTLY_ORDER)
666 : order = pageblock_order;
667 : #else
668 : VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
669 : #endif
670 :
671 : return order;
672 : }
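A worked example of the order <-> pcp list index mapping above, assuming MIGRATE_PCPTYPES == 3 and PAGE_ALLOC_COSTLY_ORDER == 3 (typical values, not guaranteed by this file).

/*
 * order_to_pindex(MIGRATE_MOVABLE, 2) = 3 * 2 + 1 = 7    (MIGRATE_MOVABLE == 1)
 * pindex_to_order(7)                  = 7 / 3     = 2
 *
 * With THP, an order == pageblock_order request uses base
 * PAGE_ALLOC_COSTLY_ORDER + 1 = 4, so its lists are pindex 12..14 and
 * pindex_to_order() maps any of them back to pageblock_order.
 */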
673 :
674 : static inline bool pcp_allowed_order(unsigned int order)
675 : {
676 479 : if (order <= PAGE_ALLOC_COSTLY_ORDER)
677 : return true;
678 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
679 : if (order == pageblock_order)
680 : return true;
681 : #endif
682 : return false;
683 : }
684 :
685 11 : static inline void free_the_page(struct page *page, unsigned int order)
686 : {
687 11 : if (pcp_allowed_order(order)) /* Via pcp? */
688 3 : free_unref_page(page, order);
689 : else
690 8 : __free_pages_ok(page, order, FPI_NONE);
691 11 : }
692 :
693 : /*
694 : * Higher-order pages are called "compound pages". They are structured thusly:
695 : *
696 : * The first PAGE_SIZE page is called the "head page" and has PG_head set.
697 : *
698 : * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
699 : * in bit 0 of page->compound_head. The rest of the bits are a pointer to the head page.
700 : *
701 : * The first tail page's ->compound_dtor holds the offset into the array of compound
702 : * page destructors. See compound_page_dtors.
703 : *
704 : * The first tail page's ->compound_order holds the order of allocation.
705 : * This usage means that zero-order pages may not be compound.
706 : */
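The bit-0 encoding described above can be sketched as follows; this is a simplified stand-in for the real compound_head() helper in <linux/page-flags.h>, not this file's code.

/* Simplified sketch of decoding ->compound_head as described above. */
static inline struct page *sketch_compound_head(struct page *page)
{
	unsigned long head = READ_ONCE(page->compound_head);

	if (head & 1)			/* bit 0 set: this is a tail page */
		return (struct page *)(head - 1);
	return page;			/* head page (or not a compound page at all) */
}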
707 :
708 0 : void free_compound_page(struct page *page)
709 : {
710 0 : mem_cgroup_uncharge(page_folio(page));
711 0 : free_the_page(page, compound_order(page));
712 0 : }
713 :
714 : static void prep_compound_head(struct page *page, unsigned int order)
715 : {
716 109 : set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
717 109 : set_compound_order(page, order);
718 218 : atomic_set(compound_mapcount_ptr(page), -1);
719 218 : atomic_set(compound_pincount_ptr(page), 0);
720 : }
721 :
722 : static void prep_compound_tail(struct page *head, int tail_idx)
723 : {
724 407 : struct page *p = head + tail_idx;
725 :
726 407 : p->mapping = TAIL_MAPPING;
727 407 : set_compound_head(p, head);
728 : }
729 :
730 0 : void prep_compound_page(struct page *page, unsigned int order)
731 : {
732 : int i;
733 109 : int nr_pages = 1 << order;
734 :
735 109 : __SetPageHead(page);
736 516 : for (i = 1; i < nr_pages; i++)
737 407 : prep_compound_tail(page, i);
738 :
739 109 : prep_compound_head(page, order);
740 0 : }
741 :
742 : #ifdef CONFIG_DEBUG_PAGEALLOC
743 : unsigned int _debug_guardpage_minorder;
744 :
745 : bool _debug_pagealloc_enabled_early __read_mostly
746 : = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
747 : EXPORT_SYMBOL(_debug_pagealloc_enabled_early);
748 : DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
749 : EXPORT_SYMBOL(_debug_pagealloc_enabled);
750 :
751 : DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
752 :
753 : static int __init early_debug_pagealloc(char *buf)
754 : {
755 : return kstrtobool(buf, &_debug_pagealloc_enabled_early);
756 : }
757 : early_param("debug_pagealloc", early_debug_pagealloc);
758 :
759 : static int __init debug_guardpage_minorder_setup(char *buf)
760 : {
761 : unsigned long res;
762 :
763 : if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
764 : pr_err("Bad debug_guardpage_minorder value\n");
765 : return 0;
766 : }
767 : _debug_guardpage_minorder = res;
768 : pr_info("Setting debug_guardpage_minorder to %lu\n", res);
769 : return 0;
770 : }
771 : early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
772 :
773 : static inline bool set_page_guard(struct zone *zone, struct page *page,
774 : unsigned int order, int migratetype)
775 : {
776 : if (!debug_guardpage_enabled())
777 : return false;
778 :
779 : if (order >= debug_guardpage_minorder())
780 : return false;
781 :
782 : __SetPageGuard(page);
783 : INIT_LIST_HEAD(&page->lru);
784 : set_page_private(page, order);
785 : /* Guard pages are not available for any usage */
786 : __mod_zone_freepage_state(zone, -(1 << order), migratetype);
787 :
788 : return true;
789 : }
790 :
791 : static inline void clear_page_guard(struct zone *zone, struct page *page,
792 : unsigned int order, int migratetype)
793 : {
794 : if (!debug_guardpage_enabled())
795 : return;
796 :
797 : __ClearPageGuard(page);
798 :
799 : set_page_private(page, 0);
800 : if (!is_migrate_isolate(migratetype))
801 : __mod_zone_freepage_state(zone, (1 << order), migratetype);
802 : }
803 : #else
804 : static inline bool set_page_guard(struct zone *zone, struct page *page,
805 : unsigned int order, int migratetype) { return false; }
806 : static inline void clear_page_guard(struct zone *zone, struct page *page,
807 : unsigned int order, int migratetype) {}
808 : #endif
809 :
810 : /*
811 : * Enable static keys related to various memory debugging and hardening options.
812 : * Some override others, and depend on early params that are evaluated in the
813 : * order of appearance. So we need to first gather the full picture of what was
814 : * enabled, and then make decisions.
815 : */
816 1 : void init_mem_debugging_and_hardening(void)
817 : {
818 1 : bool page_poisoning_requested = false;
819 :
820 : #ifdef CONFIG_PAGE_POISONING
821 : /*
822 : * Page poisoning is debug page alloc for some arches. If
823 : * either of those options are enabled, enable poisoning.
824 : */
825 : if (page_poisoning_enabled() ||
826 : (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
827 : debug_pagealloc_enabled())) {
828 : static_branch_enable(&_page_poisoning_enabled);
829 : page_poisoning_requested = true;
830 : }
831 : #endif
832 :
833 1 : if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early) &&
834 : page_poisoning_requested) {
835 : pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
836 : "will take precedence over init_on_alloc and init_on_free\n");
837 : _init_on_alloc_enabled_early = false;
838 : _init_on_free_enabled_early = false;
839 : }
840 :
841 1 : if (_init_on_alloc_enabled_early)
842 0 : static_branch_enable(&init_on_alloc);
843 : else
844 1 : static_branch_disable(&init_on_alloc);
845 :
846 1 : if (_init_on_free_enabled_early)
847 0 : static_branch_enable(&init_on_free);
848 : else
849 1 : static_branch_disable(&init_on_free);
850 :
851 : #ifdef CONFIG_DEBUG_PAGEALLOC
852 : if (!debug_pagealloc_enabled())
853 : return;
854 :
855 : static_branch_enable(&_debug_pagealloc_enabled);
856 :
857 : if (!debug_guardpage_minorder())
858 : return;
859 :
860 : static_branch_enable(&_debug_guardpage_enabled);
861 : #endif
862 1 : }
863 :
864 : static inline void set_buddy_order(struct page *page, unsigned int order)
865 : {
866 1924 : set_page_private(page, order);
867 962 : __SetPageBuddy(page);
868 : }
869 :
870 : /*
871 : * This function checks whether a page is free and is the buddy of our page.
872 : * We can coalesce a page and its buddy if
873 : * (a) the buddy is not in a hole (check before calling!) &&
874 : * (b) the buddy is in the buddy system &&
875 : * (c) a page and its buddy have the same order &&
876 : * (d) a page and its buddy are in the same zone.
877 : *
878 : * For recording whether a page is in the buddy system, we set PageBuddy.
879 : * Setting, clearing, and testing PageBuddy is serialized by zone->lock.
880 : *
881 : * For recording page's order, we use page_private(page).
882 : */
883 : static inline bool page_is_buddy(struct page *page, struct page *buddy,
884 : unsigned int order)
885 : {
886 70 : if (!page_is_guard(buddy) && !PageBuddy(buddy))
887 : return false;
888 :
889 32 : if (buddy_order(buddy) != order)
890 : return false;
891 :
892 : /*
893 : * zone check is done late to avoid uselessly calculating
894 : * zone/node ids for pages that could never merge.
895 : */
896 48 : if (page_zone_id(page) != page_zone_id(buddy))
897 : return false;
898 :
899 : VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
900 :
901 : return true;
902 : }
903 :
904 : #ifdef CONFIG_COMPACTION
905 263 : static inline struct capture_control *task_capc(struct zone *zone)
906 : {
907 263 : struct capture_control *capc = current->capture_control;
908 :
909 263 : return unlikely(capc) &&
910 0 : !(current->flags & PF_KTHREAD) &&
911 0 : !capc->page &&
912 526 : capc->cc->zone == zone ? capc : NULL;
913 : }
914 :
915 : static inline bool
916 : compaction_capture(struct capture_control *capc, struct page *page,
917 : int order, int migratetype)
918 : {
919 27 : if (!capc || order != capc->cc->order)
920 : return false;
921 :
922 : /* Do not accidentally pollute CMA or isolated regions */
923 : if (is_migrate_cma(migratetype) ||
924 0 : is_migrate_isolate(migratetype))
925 : return false;
926 :
927 : /*
928 : * Do not let lower order allocations pollute a movable pageblock.
929 : * This might let an unmovable request use a reclaimable pageblock
930 : * and vice-versa but no more than normal fallback logic which can
931 : * have trouble finding a high-order free page.
932 : */
933 0 : if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
934 : return false;
935 :
936 0 : capc->page = page;
937 : return true;
938 : }
939 :
940 : #else
941 : static inline struct capture_control *task_capc(struct zone *zone)
942 : {
943 : return NULL;
944 : }
945 :
946 : static inline bool
947 : compaction_capture(struct capture_control *capc, struct page *page,
948 : int order, int migratetype)
949 : {
950 : return false;
951 : }
952 : #endif /* CONFIG_COMPACTION */
953 :
954 : /* Used for pages not on another list */
955 : static inline void add_to_free_list(struct page *page, struct zone *zone,
956 : unsigned int order, int migratetype)
957 : {
958 699 : struct free_area *area = &zone->free_area[order];
959 :
960 1398 : list_add(&page->lru, &area->free_list[migratetype]);
961 699 : area->nr_free++;
962 : }
963 :
964 : /* Used for pages not on another list */
965 : static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
966 : unsigned int order, int migratetype)
967 : {
968 263 : struct free_area *area = &zone->free_area[order];
969 :
970 526 : list_add_tail(&page->lru, &area->free_list[migratetype]);
971 263 : area->nr_free++;
972 : }
973 :
974 : /*
975 : * Used for pages which are on another list. Move the pages to the tail
976 : * of the list - so the moved pages won't immediately be considered for
977 : * allocation again (e.g., optimization for memory onlining).
978 : */
979 : static inline void move_to_free_list(struct page *page, struct zone *zone,
980 : unsigned int order, int migratetype)
981 : {
982 2 : struct free_area *area = &zone->free_area[order];
983 :
984 4 : list_move_tail(&page->lru, &area->free_list[migratetype]);
985 : }
986 :
987 : static inline void del_page_from_free_list(struct page *page, struct zone *zone,
988 : unsigned int order)
989 : {
990 : /* clear reported state and update reported page count */
991 : if (page_reported(page))
992 : __ClearPageReported(page);
993 :
994 1398 : list_del(&page->lru);
995 699 : __ClearPageBuddy(page);
996 1398 : set_page_private(page, 0);
997 699 : zone->free_area[order].nr_free--;
998 : }
999 :
1000 : /*
1001 : * If this is not the largest possible page, check if the buddy
1002 : * of the next-highest order is free. If it is, it's possible
1003 : * that pages are being freed that will coalesce soon. If that is
1004 : * happening, add the free page to the tail of the list so it's less
1005 : * likely to be used soon and more likely to be merged into a
1006 : * higher-order page.
1007 : */
1008 : static inline bool
1009 8 : buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
1010 : struct page *page, unsigned int order)
1011 : {
1012 : struct page *higher_page, *higher_buddy;
1013 : unsigned long combined_pfn;
1014 :
1015 8 : if (order >= MAX_ORDER - 2)
1016 : return false;
1017 :
1018 8 : combined_pfn = buddy_pfn & pfn;
1019 8 : higher_page = page + (combined_pfn - pfn);
1020 16 : buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
1021 8 : higher_buddy = higher_page + (buddy_pfn - combined_pfn);
1022 :
1023 8 : return page_is_buddy(higher_page, higher_buddy, order + 1);
1024 : }
1025 :
1026 : /*
1027 : * Freeing function for a buddy system allocator.
1028 : *
1029 : * The concept of a buddy system is to maintain a direct-mapped table
1030 : * (containing bit values) for memory blocks of various "orders".
1031 : * The bottom level table contains the map for the smallest allocatable
1032 : * units of memory (here, pages), and each level above it describes
1033 : * pairs of units from the levels below, hence, "buddies".
1034 : * At a high level, all that happens here is marking the table entry
1035 : * at the bottom level available, and propagating the changes upward
1036 : * as necessary, plus some accounting needed to play nicely with other
1037 : * parts of the VM system.
1038 : * At each level, we keep a list of pages, which are heads of contiguous
1039 : * free pages of length (1 << order) and marked with PageBuddy.
1040 : * Page's order is recorded in page_private(page) field.
1041 : * So when we are allocating or freeing one, we can derive the state of the
1042 : * other. That is, if we allocate a small block, and both were
1043 : * free, the remainder of the region must be split into blocks.
1044 : * If a block is freed, and its buddy is also free, then this
1045 : * triggers coalescing into a block of larger size.
1046 : *
1047 : * -- nyc
1048 : */
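A worked example of the merge loop in __free_one_page() below, assuming __find_buddy_pfn(pfn, order) simply flips the order bit of the pfn (its mm/internal.h definition); the pfn values are made up.

/*
 *   free a page at pfn 0x108 with order 3:
 *     buddy_pfn    = 0x108 ^ 0x8  = 0x100
 *     if that buddy is free at order 3, merge:
 *     combined_pfn = 0x108 & 0x100 = 0x100, order becomes 4
 *     next buddy   = 0x100 ^ 0x10 = 0x110, and so on up the orders,
 *   until a buddy is not free (or max_order is reached), at which point the
 *   page is placed on zone->free_area[order].free_list[migratetype].
 */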
1049 :
1050 263 : static inline void __free_one_page(struct page *page,
1051 : unsigned long pfn,
1052 : struct zone *zone, unsigned int order,
1053 : int migratetype, fpi_t fpi_flags)
1054 : {
1055 263 : struct capture_control *capc = task_capc(zone);
1056 263 : unsigned int max_order = pageblock_order;
1057 : unsigned long buddy_pfn;
1058 : unsigned long combined_pfn;
1059 : struct page *buddy;
1060 : bool to_tail;
1061 :
1062 : VM_BUG_ON(!zone_is_initialized(zone));
1063 : VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
1064 :
1065 : VM_BUG_ON(migratetype == -1);
1066 263 : if (likely(!is_migrate_isolate(migratetype)))
1067 263 : __mod_zone_freepage_state(zone, 1 << order, migratetype);
1068 :
1069 : VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
1070 : VM_BUG_ON_PAGE(bad_range(zone, page), page);
1071 :
1072 : continue_merging:
1073 271 : while (order < max_order) {
1074 54 : if (compaction_capture(capc, page, order, migratetype)) {
1075 0 : __mod_zone_freepage_state(zone, -(1 << order),
1076 : migratetype);
1077 : return;
1078 : }
1079 27 : buddy_pfn = __find_buddy_pfn(pfn, order);
1080 27 : buddy = page + (buddy_pfn - pfn);
1081 :
1082 27 : if (!page_is_buddy(page, buddy, order))
1083 : goto done_merging;
1084 : /*
1085 : * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
1086 : * merge with it and move up one order.
1087 : */
1088 8 : if (page_is_guard(buddy))
1089 : clear_page_guard(zone, buddy, order, migratetype);
1090 : else
1091 : del_page_from_free_list(buddy, zone, order);
1092 8 : combined_pfn = buddy_pfn & pfn;
1093 8 : page = page + (combined_pfn - pfn);
1094 8 : pfn = combined_pfn;
1095 8 : order++;
1096 : }
1097 244 : if (order < MAX_ORDER - 1) {
1098 : /* If we are here, it means order is >= pageblock_order.
1099 : * We want to prevent merging between free pages on a pageblock
1100 : * without fallbacks and a normal pageblock. Without this,
1101 : * pageblock isolation could cause incorrect freepage or CMA
1102 : * accounting or HIGHATOMIC accounting.
1103 : *
1104 : * We don't want to hit this code for the more frequent
1105 : * low-order merging.
1106 : */
1107 : int buddy_mt;
1108 :
1109 0 : buddy_pfn = __find_buddy_pfn(pfn, order);
1110 0 : buddy = page + (buddy_pfn - pfn);
1111 :
1112 0 : if (!page_is_buddy(page, buddy, order))
1113 : goto done_merging;
1114 0 : buddy_mt = get_pageblock_migratetype(buddy);
1115 :
1116 0 : if (migratetype != buddy_mt
1117 0 : && (!migratetype_is_mergeable(migratetype) ||
1118 0 : !migratetype_is_mergeable(buddy_mt)))
1119 : goto done_merging;
1120 0 : max_order = order + 1;
1121 0 : goto continue_merging;
1122 : }
1123 :
1124 : done_merging:
1125 263 : set_buddy_order(page, order);
1126 :
1127 263 : if (fpi_flags & FPI_TO_TAIL)
1128 : to_tail = true;
1129 8 : else if (is_shuffle_order(order))
1130 : to_tail = shuffle_pick_tail();
1131 : else
1132 8 : to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
1133 :
1134 263 : if (to_tail)
1135 : add_to_free_list_tail(page, zone, order, migratetype);
1136 : else
1137 : add_to_free_list(page, zone, order, migratetype);
1138 :
1139 : /* Notify page reporting subsystem of freed page */
1140 : if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
1141 : page_reporting_notify_free(order);
1142 : }
1143 :
1144 : /*
1145 : * A bad page could be due to a number of fields. Instead of multiple branches,
1146 : * try and check multiple fields with one check. The caller must do a detailed
1147 : * check if necessary.
1148 : */
1149 : static inline bool page_expected_state(struct page *page,
1150 : unsigned long check_flags)
1151 : {
1152 505120 : if (unlikely(atomic_read(&page->_mapcount) != -1))
1153 : return false;
1154 :
1155 505120 : if (unlikely((unsigned long)page->mapping |
1156 : page_ref_count(page) |
1157 : #ifdef CONFIG_MEMCG
1158 : page->memcg_data |
1159 : #endif
1160 : (page->flags & check_flags)))
1161 : return false;
1162 :
1163 : return true;
1164 : }
1165 :
1166 : static const char *page_bad_reason(struct page *page, unsigned long flags)
1167 : {
1168 0 : const char *bad_reason = NULL;
1169 :
1170 0 : if (unlikely(atomic_read(&page->_mapcount) != -1))
1171 0 : bad_reason = "nonzero mapcount";
1172 0 : if (unlikely(page->mapping != NULL))
1173 0 : bad_reason = "non-NULL mapping";
1174 0 : if (unlikely(page_ref_count(page) != 0))
1175 0 : bad_reason = "nonzero _refcount";
1176 0 : if (unlikely(page->flags & flags)) {
1177 : if (flags == PAGE_FLAGS_CHECK_AT_PREP)
1178 : bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set";
1179 : else
1180 0 : bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
1181 : }
1182 : #ifdef CONFIG_MEMCG
1183 : if (unlikely(page->memcg_data))
1184 : bad_reason = "page still charged to cgroup";
1185 : #endif
1186 : return bad_reason;
1187 : }
1188 :
1189 0 : static void check_free_page_bad(struct page *page)
1190 : {
1191 0 : bad_page(page,
1192 : page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
1193 0 : }
1194 :
1195 251307 : static inline int check_free_page(struct page *page)
1196 : {
1197 251307 : if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
1198 : return 0;
1199 :
1200 : /* Something has gone sideways, find it */
1201 0 : check_free_page_bad(page);
1202 0 : return 1;
1203 : }
1204 :
1205 : static int free_tail_pages_check(struct page *head_page, struct page *page)
1206 : {
1207 251 : int ret = 1;
1208 :
1209 : /*
1210 : * We rely on page->lru.next never having bit 0 set, unless the page
1211 : * is PageTail(). Let's make sure that's true even for poisoned ->lru.
1212 : */
1213 : BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
1214 :
1215 : if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
1216 251 : ret = 0;
1217 : goto out;
1218 : }
1219 : switch (page - head_page) {
1220 : case 1:
1221 : /* the first tail page: ->mapping may be compound_mapcount() */
1222 : if (unlikely(compound_mapcount(page))) {
1223 : bad_page(page, "nonzero compound_mapcount");
1224 : goto out;
1225 : }
1226 : break;
1227 : case 2:
1228 : /*
1229 : * the second tail page: ->mapping is
1230 : * deferred_list.next -- ignore value.
1231 : */
1232 : break;
1233 : default:
1234 : if (page->mapping != TAIL_MAPPING) {
1235 : bad_page(page, "corrupted mapping in tail page");
1236 : goto out;
1237 : }
1238 : break;
1239 : }
1240 : if (unlikely(!PageTail(page))) {
1241 : bad_page(page, "PageTail not set");
1242 : goto out;
1243 : }
1244 : if (unlikely(compound_head(page) != head_page)) {
1245 : bad_page(page, "compound_head not consistent");
1246 : goto out;
1247 : }
1248 : ret = 0;
1249 : out:
1250 251 : page->mapping = NULL;
1251 251 : clear_compound_head(page);
1252 : return ret;
1253 : }
1254 :
1255 : /*
1256 : * Skip KASAN memory poisoning when either:
1257 : *
1258 : * 1. Deferred memory initialization has not yet completed,
1259 : * see the explanation below.
1260 : * 2. Skipping poisoning is requested via FPI_SKIP_KASAN_POISON,
1261 : * see the comment next to it.
1262 : * 3. Skipping poisoning is requested via __GFP_SKIP_KASAN_POISON,
1263 : * see the comment next to it.
1264 : *
1265 : * Poisoning pages during deferred memory init will greatly lengthen the
1266 : * process and cause problems on large memory systems, as the deferred page
1267 : * initialization is done with interrupts disabled.
1268 : *
1269 : * Assuming that there will be no reference to those newly initialized
1270 : * pages before they are ever allocated, this should have no effect on
1271 : * KASAN memory tracking as the poison will be properly inserted at page
1272 : * allocation time. The only corner case is when pages are allocated by
1273 : * on-demand allocation and then freed again before the deferred pages
1274 : * initialization is done, but this is not likely to happen.
1275 : */
1276 : static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
1277 : {
1278 : return deferred_pages_enabled() ||
1279 : (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
1280 : (fpi_flags & FPI_SKIP_KASAN_POISON)) ||
1281 : PageSkipKASanPoison(page);
1282 : }
1283 :
1284 0 : static void kernel_init_free_pages(struct page *page, int numpages)
1285 : {
1286 : int i;
1287 :
1288 : /* s390's use of memset() could override KASAN redzones. */
1289 : kasan_disable_current();
1290 334 : for (i = 0; i < numpages; i++) {
1291 334 : u8 tag = page_kasan_tag(page + i);
1292 334 : page_kasan_tag_reset(page + i);
1293 334 : clear_highpage(page + i);
1294 334 : page_kasan_tag_set(page + i, tag);
1295 : }
1296 : kasan_enable_current();
1297 0 : }
1298 :
1299 : static __always_inline bool free_pages_prepare(struct page *page,
1300 : unsigned int order, bool check_free, fpi_t fpi_flags)
1301 : {
1302 266 : int bad = 0;
1303 266 : bool init = want_init_on_free();
1304 :
1305 : VM_BUG_ON_PAGE(PageTail(page), page);
1306 :
1307 266 : trace_mm_page_free(page, order);
1308 :
1309 266 : if (unlikely(PageHWPoison(page)) && !order) {
1310 : /*
1311 : * Do not let hwpoison pages hit pcplists/buddy
1312 : * Untie memcg state and reset page's owner
1313 : */
1314 : if (memcg_kmem_enabled() && PageMemcgKmem(page))
1315 : __memcg_kmem_uncharge_page(page, order);
1316 : reset_page_owner(page, order);
1317 : page_table_check_free(page, order);
1318 : return false;
1319 : }
1320 :
1321 : /*
1322 : * Check tail pages before head page information is cleared to
1323 : * avoid checking PageCompound for order-0 pages.
1324 : */
1325 266 : if (unlikely(order)) {
1326 264 : bool compound = PageCompound(page);
1327 : int i;
1328 :
1329 : VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
1330 :
1331 : if (compound) {
1332 : ClearPageDoubleMap(page);
1333 : ClearPageHasHWPoisoned(page);
1334 : }
1335 251044 : for (i = 1; i < (1 << order); i++) {
1336 251044 : if (compound)
1337 502 : bad += free_tail_pages_check(page, page + i);
1338 251044 : if (unlikely(check_free_page(page + i))) {
1339 0 : bad++;
1340 0 : continue;
1341 : }
1342 251044 : (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
1343 : }
1344 : }
1345 266 : if (PageMappingFlags(page))
1346 0 : page->mapping = NULL;
1347 : if (memcg_kmem_enabled() && PageMemcgKmem(page))
1348 : __memcg_kmem_uncharge_page(page, order);
1349 : if (check_free)
1350 263 : bad += check_free_page(page);
1351 266 : if (bad)
1352 : return false;
1353 :
1354 266 : page_cpupid_reset_last(page);
1355 266 : page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
1356 : reset_page_owner(page, order);
1357 266 : page_table_check_free(page, order);
1358 :
1359 266 : if (!PageHighMem(page)) {
1360 : debug_check_no_locks_freed(page_address(page),
1361 : PAGE_SIZE << order);
1362 : debug_check_no_obj_freed(page_address(page),
1363 : PAGE_SIZE << order);
1364 : }
1365 :
1366 266 : kernel_poison_pages(page, 1 << order);
1367 :
1368 : /*
1369 : * As memory initialization might be integrated into KASAN,
1370 : * KASAN poisoning and memory initialization code must be
1371 : * kept together to avoid discrepancies in behavior.
1372 : *
1373 : * With hardware tag-based KASAN, memory tags must be set before the
1374 : * page becomes unavailable via debug_pagealloc or arch_free_page.
1375 : */
1376 266 : if (!should_skip_kasan_poison(page, fpi_flags)) {
1377 : kasan_poison_pages(page, order, init);
1378 :
1379 : /* Memory is already initialized if KASAN did it internally. */
1380 : if (kasan_has_integrated_init())
1381 : init = false;
1382 : }
1383 266 : if (init)
1384 0 : kernel_init_free_pages(page, 1 << order);
1385 :
1386 : /*
1387 : * arch_free_page() can make the page's contents inaccessible. s390
1388 : * does this. So nothing which can access the page's contents should
1389 : * happen after this.
1390 : */
1391 : arch_free_page(page, order);
1392 :
1393 : debug_pagealloc_unmap_pages(page, 1 << order);
1394 :
1395 : return true;
1396 : }
1397 :
1398 : #ifdef CONFIG_DEBUG_VM
1399 : /*
1400 : * With DEBUG_VM enabled, order-0 pages are checked immediately when being freed
1401 : * to pcp lists. With debug_pagealloc also enabled, they are also rechecked when
1402 : * moved from pcp lists to free lists.
1403 : */
1404 : static bool free_pcp_prepare(struct page *page, unsigned int order)
1405 : {
1406 : return free_pages_prepare(page, order, true, FPI_NONE);
1407 : }
1408 :
1409 : static bool bulkfree_pcp_prepare(struct page *page)
1410 : {
1411 : if (debug_pagealloc_enabled_static())
1412 : return check_free_page(page);
1413 : else
1414 : return false;
1415 : }
1416 : #else
1417 : /*
1418 : * With DEBUG_VM disabled, order-0 pages being freed are checked only when
1419 : * moving from pcp lists to free list in order to reduce overhead. With
1420 : * debug_pagealloc enabled, they are checked also immediately when being freed
1421 : * to the pcp lists.
1422 : */
1423 3 : static bool free_pcp_prepare(struct page *page, unsigned int order)
1424 : {
1425 : if (debug_pagealloc_enabled_static())
1426 : return free_pages_prepare(page, order, true, FPI_NONE);
1427 : else
1428 3 : return free_pages_prepare(page, order, false, FPI_NONE);
1429 : }
1430 :
1431 : static bool bulkfree_pcp_prepare(struct page *page)
1432 : {
1433 0 : return check_free_page(page);
1434 : }
1435 : #endif /* CONFIG_DEBUG_VM */
1436 :
1437 : /*
1438 : * Frees a number of pages from the PCP lists
1439 : * Assumes all pages on list are in same zone.
1440 : * count is the number of pages to free.
1441 : */
1442 0 : static void free_pcppages_bulk(struct zone *zone, int count,
1443 : struct per_cpu_pages *pcp,
1444 : int pindex)
1445 : {
1446 0 : int min_pindex = 0;
1447 0 : int max_pindex = NR_PCP_LISTS - 1;
1448 : unsigned int order;
1449 : bool isolated_pageblocks;
1450 : struct page *page;
1451 :
1452 : /*
1453 : * Ensure a proper count is passed; otherwise we would get stuck in the
1454 : * while (list_empty(list)) loop below.
1455 : */
1456 0 : count = min(pcp->count, count);
1457 :
1458 : /* Ensure requested pindex is drained first. */
1459 0 : pindex = pindex - 1;
1460 :
1461 : /*
1462 : * local_lock_irq held so equivalent to spin_lock_irqsave for
1463 : * both PREEMPT_RT and non-PREEMPT_RT configurations.
1464 : */
1465 0 : spin_lock(&zone->lock);
1466 0 : isolated_pageblocks = has_isolate_pageblock(zone);
1467 :
1468 0 : while (count > 0) {
1469 : struct list_head *list;
1470 : int nr_pages;
1471 :
1472 : /* Remove pages from lists in a round-robin fashion. */
1473 : do {
1474 0 : if (++pindex > max_pindex)
1475 0 : pindex = min_pindex;
1476 0 : list = &pcp->lists[pindex];
1477 0 : if (!list_empty(list))
1478 : break;
1479 :
1480 0 : if (pindex == max_pindex)
1481 0 : max_pindex--;
1482 0 : if (pindex == min_pindex)
1483 0 : min_pindex++;
1484 : } while (1);
1485 :
1486 0 : order = pindex_to_order(pindex);
1487 0 : nr_pages = 1 << order;
1488 : BUILD_BUG_ON(MAX_ORDER >= (1<<NR_PCP_ORDER_WIDTH));
1489 : do {
1490 : int mt;
1491 :
1492 0 : page = list_last_entry(list, struct page, lru);
1493 0 : mt = get_pcppage_migratetype(page);
1494 :
1495 : /* must delete to avoid corrupting pcp list */
1496 0 : list_del(&page->lru);
1497 0 : count -= nr_pages;
1498 0 : pcp->count -= nr_pages;
1499 :
1500 0 : if (bulkfree_pcp_prepare(page))
1501 0 : continue;
1502 :
1503 : /* MIGRATE_ISOLATE page should not go to pcplists */
1504 : VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
1505 : /* Pageblock could have been isolated meanwhile */
1506 : if (unlikely(isolated_pageblocks))
1507 : mt = get_pageblock_migratetype(page);
1508 :
1509 0 : __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE);
1510 0 : trace_mm_page_pcpu_drain(page, order, mt);
1511 0 : } while (count > 0 && !list_empty(list));
1512 : }
1513 :
1514 0 : spin_unlock(&zone->lock);
1515 0 : }
1516 :
1517 : static void free_one_page(struct zone *zone,
1518 : struct page *page, unsigned long pfn,
1519 : unsigned int order,
1520 : int migratetype, fpi_t fpi_flags)
1521 : {
1522 : unsigned long flags;
1523 :
1524 : spin_lock_irqsave(&zone->lock, flags);
1525 : if (unlikely(has_isolate_pageblock(zone) ||
1526 : is_migrate_isolate(migratetype))) {
1527 : migratetype = get_pfnblock_migratetype(page, pfn);
1528 : }
1529 : __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
1530 : spin_unlock_irqrestore(&zone->lock, flags);
1531 : }
1532 :
1533 266125 : static void __meminit __init_single_page(struct page *page, unsigned long pfn,
1534 : unsigned long zone, int nid)
1535 : {
1536 266125 : mm_zero_struct_page(page);
1537 532250 : set_page_links(page, zone, nid, pfn);
1538 266125 : init_page_count(page);
1539 266125 : page_mapcount_reset(page);
1540 266125 : page_cpupid_reset_last(page);
1541 266125 : page_kasan_tag_reset(page);
1542 :
1543 532250 : INIT_LIST_HEAD(&page->lru);
1544 : #ifdef WANT_PAGE_VIRTUAL
1545 : /* The shift won't overflow because ZONE_NORMAL is below 4G. */
1546 : if (!is_highmem_idx(zone))
1547 : set_page_address(page, __va(pfn << PAGE_SHIFT));
1548 : #endif
1549 266125 : }
1550 :
1551 : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1552 : static void __meminit init_reserved_page(unsigned long pfn)
1553 : {
1554 : pg_data_t *pgdat;
1555 : int nid, zid;
1556 :
1557 : if (!early_page_uninitialised(pfn))
1558 : return;
1559 :
1560 : nid = early_pfn_to_nid(pfn);
1561 : pgdat = NODE_DATA(nid);
1562 :
1563 : for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1564 : struct zone *zone = &pgdat->node_zones[zid];
1565 :
1566 : if (zone_spans_pfn(zone, pfn))
1567 : break;
1568 : }
1569 : __init_single_page(pfn_to_page(pfn), pfn, zid, nid);
1570 : }
1571 : #else
1572 : static inline void init_reserved_page(unsigned long pfn)
1573 : {
1574 : }
1575 : #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
1576 :
1577 : /*
1578 : * Initialised pages do not have PageReserved set. This function is
1579 : * called for each range allocated by the bootmem allocator and
1580 : * marks the pages PageReserved. The remaining valid pages are later
1581 : * sent to the buddy page allocator.
1582 : */
1583 13 : void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
1584 : {
1585 13 : unsigned long start_pfn = PFN_DOWN(start);
1586 13 : unsigned long end_pfn = PFN_UP(end);
1587 :
1588 15098 : for (; start_pfn < end_pfn; start_pfn++) {
1589 15085 : if (pfn_valid(start_pfn)) {
1590 15085 : struct page *page = pfn_to_page(start_pfn);
1591 :
1592 15085 : init_reserved_page(start_pfn);
1593 :
1594 : /* Avoid false-positive PageTail() */
1595 30170 : INIT_LIST_HEAD(&page->lru);
1596 :
1597 : /*
1598 : * no need for atomic set_bit because the struct
1599 : * page is not visible yet so nobody should
1600 : * access it yet.
1601 : */
1602 : __SetPageReserved(page);
1603 : }
1604 : }
1605 13 : }
1606 :
1607 263 : static void __free_pages_ok(struct page *page, unsigned int order,
1608 : fpi_t fpi_flags)
1609 : {
1610 : unsigned long flags;
1611 : int migratetype;
1612 263 : unsigned long pfn = page_to_pfn(page);
1613 263 : struct zone *zone = page_zone(page);
1614 :
1615 263 : if (!free_pages_prepare(page, order, true, fpi_flags))
1616 : return;
1617 :
1618 263 : migratetype = get_pfnblock_migratetype(page, pfn);
1619 :
1620 263 : spin_lock_irqsave(&zone->lock, flags);
1621 : if (unlikely(has_isolate_pageblock(zone) ||
1622 : is_migrate_isolate(migratetype))) {
1623 : migratetype = get_pfnblock_migratetype(page, pfn);
1624 : }
1625 263 : __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
1626 526 : spin_unlock_irqrestore(&zone->lock, flags);
1627 :
1628 263 : __count_vm_events(PGFREE, 1 << order);
1629 : }
1630 :
1631 255 : void __free_pages_core(struct page *page, unsigned int order)
1632 : {
1633 255 : unsigned int nr_pages = 1 << order;
1634 255 : struct page *p = page;
1635 : unsigned int loop;
1636 :
1637 : /*
1638 : * When initializing the memmap, __init_single_page() sets the refcount
1639 : * of all pages to 1 ("allocated"/"not free"). We have to set the
1640 : * refcount of all involved pages to 0.
1641 : */
1642 255 : prefetchw(p);
1643 251048 : for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
1644 250793 : prefetchw(p + 1);
1645 250793 : __ClearPageReserved(p);
1646 250793 : set_page_count(p, 0);
1647 : }
1648 255 : __ClearPageReserved(p);
1649 255 : set_page_count(p, 0);
1650 :
1651 510 : atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
1652 :
1653 : /*
1654 : * Bypass PCP and place fresh pages right to the tail, primarily
1655 : * relevant for memory onlining.
1656 : */
1657 255 : __free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON);
1658 255 : }
1659 :
1660 : #ifdef CONFIG_NUMA
1661 :
1662 : /*
1663 : * During memory init memblocks map pfns to nids. The search is expensive and
1664 : * this caches recent lookups. The implementation of __early_pfn_to_nid
1665 : * treats start/end as pfns.
1666 : */
1667 : struct mminit_pfnnid_cache {
1668 : unsigned long last_start;
1669 : unsigned long last_end;
1670 : int last_nid;
1671 : };
1672 :
1673 : static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
1674 :
1675 : /*
1676 : * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
1677 : */
1678 : static int __meminit __early_pfn_to_nid(unsigned long pfn,
1679 : struct mminit_pfnnid_cache *state)
1680 : {
1681 : unsigned long start_pfn, end_pfn;
1682 : int nid;
1683 :
1684 : if (state->last_start <= pfn && pfn < state->last_end)
1685 : return state->last_nid;
1686 :
1687 : nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
1688 : if (nid != NUMA_NO_NODE) {
1689 : state->last_start = start_pfn;
1690 : state->last_end = end_pfn;
1691 : state->last_nid = nid;
1692 : }
1693 :
1694 : return nid;
1695 : }
1696 :
1697 : int __meminit early_pfn_to_nid(unsigned long pfn)
1698 : {
1699 : static DEFINE_SPINLOCK(early_pfn_lock);
1700 : int nid;
1701 :
1702 : spin_lock(&early_pfn_lock);
1703 : nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
1704 : if (nid < 0)
1705 : nid = first_online_node;
1706 : spin_unlock(&early_pfn_lock);
1707 :
1708 : return nid;
1709 : }
1710 : #endif /* CONFIG_NUMA */
1711 :
1712 255 : void __init memblock_free_pages(struct page *page, unsigned long pfn,
1713 : unsigned int order)
1714 : {
1715 255 : if (early_page_uninitialised(pfn))
1716 : return;
1717 255 : __free_pages_core(page, order);
1718 : }
1719 :
1720 : /*
1721 : * Check that the whole (or subset of) a pageblock given by the interval of
1722 : * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
1723 : * with the migration or free compaction scanners.
1724 : *
1725 : * Return struct page pointer of start_pfn, or NULL if checks were not passed.
1726 : *
1727 : * It's possible on some configurations to have a setup like node0 node1 node0
1728 : * i.e. it's possible that not all pages within a zone's range of pages
1729 : * belong to a single zone. We assume that a border between node0 and node1
1730 : * can occur within a single pageblock, but not a node0 node1 node0
1731 : * interleaving within a single pageblock. It is therefore sufficient to check
1732 : * the first and last page of a pageblock and avoid checking each individual
1733 : * page in a pageblock.
1734 : */
1735 260 : struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
1736 : unsigned long end_pfn, struct zone *zone)
1737 : {
1738 : struct page *start_page;
1739 : struct page *end_page;
1740 :
1741 : /* end_pfn is one past the range we are checking */
1742 260 : end_pfn--;
1743 :
1744 260 : if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
1745 : return NULL;
1746 :
1747 260 : start_page = pfn_to_online_page(start_pfn);
1748 260 : if (!start_page)
1749 : return NULL;
1750 :
1751 260 : if (page_zone(start_page) != zone)
1752 : return NULL;
1753 :
1754 260 : end_page = pfn_to_page(end_pfn);
1755 :
1756 : /* This gives a shorter code than deriving page_zone(end_page) */
1757 780 : if (page_zone_id(start_page) != page_zone_id(end_page))
1758 : return NULL;
1759 :
1760 260 : return start_page;
1761 : }
1762 :
1763 1 : void set_zone_contiguous(struct zone *zone)
1764 : {
1765 1 : unsigned long block_start_pfn = zone->zone_start_pfn;
1766 : unsigned long block_end_pfn;
1767 :
1768 1 : block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages);
1769 523 : for (; block_start_pfn < zone_end_pfn(zone);
1770 260 : block_start_pfn = block_end_pfn,
1771 260 : block_end_pfn += pageblock_nr_pages) {
1772 :
1773 260 : block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
1774 :
1775 260 : if (!__pageblock_pfn_to_page(block_start_pfn,
1776 : block_end_pfn, zone))
1777 : return;
1778 260 : cond_resched();
1779 : }
1780 :
1781 : /* We confirm that there is no hole */
1782 1 : zone->contiguous = true;
1783 : }
1784 :
1785 0 : void clear_zone_contiguous(struct zone *zone)
1786 : {
1787 0 : zone->contiguous = false;
1788 0 : }
1789 :
1790 : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1791 : static void __init deferred_free_range(unsigned long pfn,
1792 : unsigned long nr_pages)
1793 : {
1794 : struct page *page;
1795 : unsigned long i;
1796 :
1797 : if (!nr_pages)
1798 : return;
1799 :
1800 : page = pfn_to_page(pfn);
1801 :
1802 : /* Free a large naturally-aligned chunk if possible */
1803 : if (nr_pages == pageblock_nr_pages &&
1804 : (pfn & (pageblock_nr_pages - 1)) == 0) {
1805 : set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1806 : __free_pages_core(page, pageblock_order);
1807 : return;
1808 : }
1809 :
1810 : for (i = 0; i < nr_pages; i++, page++, pfn++) {
1811 : if ((pfn & (pageblock_nr_pages - 1)) == 0)
1812 : set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1813 : __free_pages_core(page, 0);
1814 : }
1815 : }
1816 :
1817 : /* Completion tracking for deferred_init_memmap() threads */
1818 : static atomic_t pgdat_init_n_undone __initdata;
1819 : static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
1820 :
1821 : static inline void __init pgdat_init_report_one_done(void)
1822 : {
1823 : if (atomic_dec_and_test(&pgdat_init_n_undone))
1824 : complete(&pgdat_init_all_done_comp);
1825 : }
1826 :
1827 : /*
1828 : * Returns true if page needs to be initialized or freed to buddy allocator.
1829 : *
1830 : * First we check if pfn is valid on architectures where it is possible to have
1831 : * holes within pageblock_nr_pages. On systems where it is not possible, this
1832 : * function is optimized out.
1833 : *
1834 : * Then we check whether the current large page is valid by checking only
1835 : * the validity of the head pfn.
1836 : */
1837 : static inline bool __init deferred_pfn_valid(unsigned long pfn)
1838 : {
1839 : if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn))
1840 : return false;
1841 : return true;
1842 : }
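/*
 * For example, with pageblock_nr_pages == 512 (typical with 4K base pages),
 * pfn_valid() above is consulted only for pfns 0, 512, 1024, ...; pfns in
 * the interior of a pageblock are assumed valid, relying on the absence of
 * holes within a single pageblock.
 */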
1843 :
1844 : /*
1845 : * Free pages to buddy allocator. Try to free aligned pages in
1846 : * pageblock_nr_pages sizes.
1847 : */
1848 : static void __init deferred_free_pages(unsigned long pfn,
1849 : unsigned long end_pfn)
1850 : {
1851 : unsigned long nr_pgmask = pageblock_nr_pages - 1;
1852 : unsigned long nr_free = 0;
1853 :
1854 : for (; pfn < end_pfn; pfn++) {
1855 : if (!deferred_pfn_valid(pfn)) {
1856 : deferred_free_range(pfn - nr_free, nr_free);
1857 : nr_free = 0;
1858 : } else if (!(pfn & nr_pgmask)) {
1859 : deferred_free_range(pfn - nr_free, nr_free);
1860 : nr_free = 1;
1861 : } else {
1862 : nr_free++;
1863 : }
1864 : }
1865 : /* Free the last block of pages to allocator */
1866 : deferred_free_range(pfn - nr_free, nr_free);
1867 : }
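/*
 * Worked example of the accumulation above, assuming pageblock_nr_pages ==
 * 512 and a fully valid pfn range [0, 1024): pfn 0 hits the boundary case
 * and sets nr_free to 1, pfns 1..511 grow the run to 512 pages, and pfn 512
 * flushes pfns [0, 512) as one naturally aligned pageblock via
 * deferred_free_range(). The call after the loop frees the remaining
 * [512, 1024) block the same way.
 */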
1868 :
1869 : /*
1870 : * Initialize struct pages. We minimize pfn page lookups and scheduler checks
1871 : * by performing them only once every pageblock_nr_pages.
1872 : * Return number of pages initialized.
1873 : */
1874 : static unsigned long __init deferred_init_pages(struct zone *zone,
1875 : unsigned long pfn,
1876 : unsigned long end_pfn)
1877 : {
1878 : unsigned long nr_pgmask = pageblock_nr_pages - 1;
1879 : int nid = zone_to_nid(zone);
1880 : unsigned long nr_pages = 0;
1881 : int zid = zone_idx(zone);
1882 : struct page *page = NULL;
1883 :
1884 : for (; pfn < end_pfn; pfn++) {
1885 : if (!deferred_pfn_valid(pfn)) {
1886 : page = NULL;
1887 : continue;
1888 : } else if (!page || !(pfn & nr_pgmask)) {
1889 : page = pfn_to_page(pfn);
1890 : } else {
1891 : page++;
1892 : }
1893 : __init_single_page(page, pfn, zid, nid);
1894 : nr_pages++;
1895 : }
1896 : return (nr_pages);
1897 : }
1898 :
1899 : /*
1900 : * This function is meant to pre-load the iterator for the zone init.
1901 : * Specifically it walks through the ranges until we are caught up to the
1902 : * first_init_pfn value and exits there. If we never encounter the value we
1903 : * return false indicating there are no valid ranges left.
1904 : */
1905 : static bool __init
1906 : deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
1907 : unsigned long *spfn, unsigned long *epfn,
1908 : unsigned long first_init_pfn)
1909 : {
1910 : u64 j;
1911 :
1912 : /*
1913 : * Start out by walking through the ranges in this zone that have
1914 : * already been initialized. We don't need to do anything with them
1915 : * so we just need to flush them out of the system.
1916 : */
1917 : for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) {
1918 : if (*epfn <= first_init_pfn)
1919 : continue;
1920 : if (*spfn < first_init_pfn)
1921 : *spfn = first_init_pfn;
1922 : *i = j;
1923 : return true;
1924 : }
1925 :
1926 : return false;
1927 : }
1928 :
1929 : /*
1930 : * Initialize and free pages. We do it in two loops: first we initialize
1931 : * struct page, then free to buddy allocator, because while we are
1932 : * freeing pages we can access pages that are ahead (computing buddy
1933 : * page in __free_one_page()).
1934 : *
1935 : * In order to try and keep some memory in the cache we have the loop
1936 : * broken along max page order boundaries. This way we will not cause
1937 : * any issues with the buddy page computation.
1938 : */
1939 : static unsigned long __init
1940 : deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
1941 : unsigned long *end_pfn)
1942 : {
1943 : unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
1944 : unsigned long spfn = *start_pfn, epfn = *end_pfn;
1945 : unsigned long nr_pages = 0;
1946 : u64 j = *i;
1947 :
1948 : /* First we loop through and initialize the page values */
1949 : for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
1950 : unsigned long t;
1951 :
1952 : if (mo_pfn <= *start_pfn)
1953 : break;
1954 :
1955 : t = min(mo_pfn, *end_pfn);
1956 : nr_pages += deferred_init_pages(zone, *start_pfn, t);
1957 :
1958 : if (mo_pfn < *end_pfn) {
1959 : *start_pfn = mo_pfn;
1960 : break;
1961 : }
1962 : }
1963 :
1964 : /* Reset values and now loop through freeing pages as needed */
1965 : swap(j, *i);
1966 :
1967 : for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
1968 : unsigned long t;
1969 :
1970 : if (mo_pfn <= spfn)
1971 : break;
1972 :
1973 : t = min(mo_pfn, epfn);
1974 : deferred_free_pages(spfn, t);
1975 :
1976 : if (mo_pfn <= epfn)
1977 : break;
1978 : }
1979 :
1980 : return nr_pages;
1981 : }
1982 :
1983 : static void __init
1984 : deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
1985 : void *arg)
1986 : {
1987 : unsigned long spfn, epfn;
1988 : struct zone *zone = arg;
1989 : u64 i;
1990 :
1991 : deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
1992 :
1993 : /*
1994 : * Initialize and free pages in MAX_ORDER sized increments so that we
1995 : * can avoid introducing any issues with the buddy allocator.
1996 : */
1997 : while (spfn < end_pfn) {
1998 : deferred_init_maxorder(&i, zone, &spfn, &epfn);
1999 : cond_resched();
2000 : }
2001 : }
2002 :
2003 : /* An arch may override for more concurrency. */
2004 : __weak int __init
2005 : deferred_page_init_max_threads(const struct cpumask *node_cpumask)
2006 : {
2007 : return 1;
2008 : }
2009 :
2010 : /* Initialise remaining memory on a node */
2011 : static int __init deferred_init_memmap(void *data)
2012 : {
2013 : pg_data_t *pgdat = data;
2014 : const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
2015 : unsigned long spfn = 0, epfn = 0;
2016 : unsigned long first_init_pfn, flags;
2017 : unsigned long start = jiffies;
2018 : struct zone *zone;
2019 : int zid, max_threads;
2020 : u64 i;
2021 :
2022 : /* Bind memory initialisation thread to a local node if possible */
2023 : if (!cpumask_empty(cpumask))
2024 : set_cpus_allowed_ptr(current, cpumask);
2025 :
2026 : pgdat_resize_lock(pgdat, &flags);
2027 : first_init_pfn = pgdat->first_deferred_pfn;
2028 : if (first_init_pfn == ULONG_MAX) {
2029 : pgdat_resize_unlock(pgdat, &flags);
2030 : pgdat_init_report_one_done();
2031 : return 0;
2032 : }
2033 :
2034 : /* Sanity check boundaries */
2035 : BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
2036 : BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
2037 : pgdat->first_deferred_pfn = ULONG_MAX;
2038 :
2039 : /*
2040 : * Once we unlock here, the zone cannot be grown anymore, thus if an
2041 : * interrupt thread must allocate this early in boot, zone must be
2042 : * pre-grown prior to start of deferred page initialization.
2043 : */
2044 : pgdat_resize_unlock(pgdat, &flags);
2045 :
2046 : /* Only the highest zone is deferred so find it */
2047 : for (zid = 0; zid < MAX_NR_ZONES; zid++) {
2048 : zone = pgdat->node_zones + zid;
2049 : if (first_init_pfn < zone_end_pfn(zone))
2050 : break;
2051 : }
2052 :
2053 : /* If the zone is empty somebody else may have cleared out the zone */
2054 : if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
2055 : first_init_pfn))
2056 : goto zone_empty;
2057 :
2058 : max_threads = deferred_page_init_max_threads(cpumask);
2059 :
2060 : while (spfn < epfn) {
2061 : unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION);
2062 : struct padata_mt_job job = {
2063 : .thread_fn = deferred_init_memmap_chunk,
2064 : .fn_arg = zone,
2065 : .start = spfn,
2066 : .size = epfn_align - spfn,
2067 : .align = PAGES_PER_SECTION,
2068 : .min_chunk = PAGES_PER_SECTION,
2069 : .max_threads = max_threads,
2070 : };
2071 :
2072 : padata_do_multithreaded(&job);
2073 : deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
2074 : epfn_align);
2075 : }
2076 : zone_empty:
2077 : /* Sanity check that the next zone really is unpopulated */
2078 : WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
2079 :
2080 : pr_info("node %d deferred pages initialised in %ums\n",
2081 : pgdat->node_id, jiffies_to_msecs(jiffies - start));
2082 :
2083 : pgdat_init_report_one_done();
2084 : return 0;
2085 : }
2086 :
2087 : /*
2088 : * If this zone has deferred pages, try to grow it by initializing enough
2089 : * deferred pages to satisfy the allocation specified by order, rounded up to
2090 : * the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments
2091 : * of SECTION_SIZE bytes by initializing struct pages in increments of
2092 : * PAGES_PER_SECTION * sizeof(struct page) bytes.
2093 : *
2094 : * Return true when zone was grown, otherwise return false. We return true even
2095 : * when we grow less than requested, to let the caller decide if there are
2096 : * enough pages to satisfy the allocation.
2097 : *
2098 : * Note: We use noinline because this function is needed only during boot, and
2099 : * it is called from a __ref function _deferred_grow_zone. This way we are
2100 : * making sure that it is not inlined into permanent text section.
2101 : */
2102 : static noinline bool __init
2103 : deferred_grow_zone(struct zone *zone, unsigned int order)
2104 : {
2105 : unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
2106 : pg_data_t *pgdat = zone->zone_pgdat;
2107 : unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
2108 : unsigned long spfn, epfn, flags;
2109 : unsigned long nr_pages = 0;
2110 : u64 i;
2111 :
2112 : /* Only the last zone may have deferred pages */
2113 : if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
2114 : return false;
2115 :
2116 : pgdat_resize_lock(pgdat, &flags);
2117 :
2118 : /*
2119 : * If someone grew this zone while we were waiting for spinlock, return
2120 : * true, as there might be enough pages already.
2121 : */
2122 : if (first_deferred_pfn != pgdat->first_deferred_pfn) {
2123 : pgdat_resize_unlock(pgdat, &flags);
2124 : return true;
2125 : }
2126 :
2127 : /* If the zone is empty somebody else may have cleared out the zone */
2128 : if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
2129 : first_deferred_pfn)) {
2130 : pgdat->first_deferred_pfn = ULONG_MAX;
2131 : pgdat_resize_unlock(pgdat, &flags);
2132 : /* Retry only once. */
2133 : return first_deferred_pfn != ULONG_MAX;
2134 : }
2135 :
2136 : /*
2137 : * Initialize and free pages in MAX_ORDER sized increments so
2138 : * that we can avoid introducing any issues with the buddy
2139 : * allocator.
2140 : */
2141 : while (spfn < epfn) {
2142 : /* update our first deferred PFN for this section */
2143 : first_deferred_pfn = spfn;
2144 :
2145 : nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
2146 : touch_nmi_watchdog();
2147 :
2148 : /* We should only stop along section boundaries */
2149 : if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
2150 : continue;
2151 :
2152 : /* If our quota has been met we can stop here */
2153 : if (nr_pages >= nr_pages_needed)
2154 : break;
2155 : }
2156 :
2157 : pgdat->first_deferred_pfn = spfn;
2158 : pgdat_resize_unlock(pgdat, &flags);
2159 :
2160 : return nr_pages > 0;
2161 : }
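/*
 * For example, a single order-3 allocation that falls into this path asks
 * for ALIGN(8, PAGES_PER_SECTION) pages, so at least one full section's
 * worth of struct pages is initialized (32768 pages with the common 128MB
 * section size and 4K pages) even though only 8 pages were requested.
 */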
2162 :
2163 : /*
2164 : * deferred_grow_zone() is __init, but it is called from
2165 : * get_page_from_freelist() during early boot until deferred_pages permanently
2166 : * disables this call. This is why we have a __ref wrapper: it avoids the
2167 : * section mismatch warning while the function body still gets unloaded.
2168 : */
2169 : static bool __ref
2170 : _deferred_grow_zone(struct zone *zone, unsigned int order)
2171 : {
2172 : return deferred_grow_zone(zone, order);
2173 : }
2174 :
2175 : #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
2176 :
2177 1 : void __init page_alloc_init_late(void)
2178 : {
2179 : struct zone *zone;
2180 : int nid;
2181 :
2182 : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
2183 :
2184 : /* There will be num_node_state(N_MEMORY) threads */
2185 : atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
2186 : for_each_node_state(nid, N_MEMORY) {
2187 : kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
2188 : }
2189 :
2190 : /* Block until all are initialised */
2191 : wait_for_completion(&pgdat_init_all_done_comp);
2192 :
2193 : /*
2194 : * We initialized the rest of the deferred pages. Permanently disable
2195 : * on-demand struct page initialization.
2196 : */
2197 : static_branch_disable(&deferred_pages);
2198 :
2199 : /* Reinit limits that are based on free pages after the kernel is up */
2200 : files_maxfiles_init();
2201 : #endif
2202 :
2203 1 : buffer_init();
2204 :
2205 : /* Discard memblock private memory */
2206 1 : memblock_discard();
2207 :
2208 1 : for_each_node_state(nid, N_MEMORY)
2209 : shuffle_free_memory(NODE_DATA(nid));
2210 :
2211 3 : for_each_populated_zone(zone)
2212 1 : set_zone_contiguous(zone);
2213 1 : }
2214 :
2215 : #ifdef CONFIG_CMA
2216 : /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
2217 : void __init init_cma_reserved_pageblock(struct page *page)
2218 : {
2219 : unsigned i = pageblock_nr_pages;
2220 : struct page *p = page;
2221 :
2222 : do {
2223 : __ClearPageReserved(p);
2224 : set_page_count(p, 0);
2225 : } while (++p, --i);
2226 :
2227 : set_pageblock_migratetype(page, MIGRATE_CMA);
2228 : set_page_refcounted(page);
2229 : __free_pages(page, pageblock_order);
2230 :
2231 : adjust_managed_page_count(page, pageblock_nr_pages);
2232 : page_zone(page)->cma_pages += pageblock_nr_pages;
2233 : }
2234 : #endif
2235 :
2236 : /*
2237 : * The order of subdivision here is critical for the IO subsystem.
2238 : * Please do not alter this order without good reasons and regression
2239 : * testing. Specifically, as large blocks of memory are subdivided,
2240 : * the order in which smaller blocks are delivered depends on the order
2241 : * they're subdivided in this function. This is the primary factor
2242 : * influencing the order in which pages are delivered to the IO
2243 : * subsystem according to empirical testing, and this is also justified
2244 : * by considering the behavior of a buddy system containing a single
2245 : * large block of memory acted on by a series of small allocations.
2246 : * This behavior is a critical factor in sglist merging's success.
2247 : *
2248 : * -- nyc
2249 : */
2250 : static inline void expand(struct zone *zone, struct page *page,
2251 : int low, int high, int migratetype)
2252 : {
2253 691 : unsigned long size = 1 << high;
2254 :
2255 1390 : while (high > low) {
2256 699 : high--;
2257 699 : size >>= 1;
2258 : VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
2259 :
2260 : /*
2261 : * Mark as guard pages (or page), so that they can be merged
2262 : * back into the allocator when the buddy is freed.
2263 : * Corresponding page table entries will not be touched;
2264 : * the pages will stay not present in the virtual address space.
2265 : */
2266 699 : if (set_page_guard(zone, &page[size], high, migratetype))
2267 : continue;
2268 :
2269 1398 : add_to_free_list(&page[size], zone, high, migratetype);
2270 699 : set_buddy_order(&page[size], high);
2271 : }
2272 : }
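/*
 * Example of the split above: an order-2 request satisfied from an order-5
 * free page peels off and frees the upper halves as buddies of order 4, 3
 * and 2 (page[16], page[8] and page[4] in base-page units), leaving the
 * caller with the order-2 chunk starting at page[0].
 */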
2273 :
2274 0 : static void check_new_page_bad(struct page *page)
2275 : {
2276 : if (unlikely(page->flags & __PG_HWPOISON)) {
2277 : /* Don't complain about hwpoisoned pages */
2278 : page_mapcount_reset(page); /* remove PageBuddy */
2279 : return;
2280 : }
2281 :
2282 0 : bad_page(page,
2283 : page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP));
2284 : }
2285 :
2286 : /*
2287 : * This page is about to be returned from the page allocator
2288 : */
2289 1253 : static inline int check_new_page(struct page *page)
2290 : {
2291 1253 : if (likely(page_expected_state(page,
2292 : PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
2293 : return 0;
2294 :
2295 0 : check_new_page_bad(page);
2296 0 : return 1;
2297 : }
2298 :
2299 : static bool check_new_pages(struct page *page, unsigned int order)
2300 : {
2301 : int i;
2302 1253 : for (i = 0; i < (1 << order); i++) {
2303 1253 : struct page *p = page + i;
2304 :
2305 1253 : if (unlikely(check_new_page(p)))
2306 : return true;
2307 : }
2308 :
2309 : return false;
2310 : }
2311 :
2312 : #ifdef CONFIG_DEBUG_VM
2313 : /*
2314 : * With DEBUG_VM enabled, order-0 pages are checked for expected state when
2315 : * being allocated from pcp lists. With debug_pagealloc also enabled, they are
2316 : * also checked when pcp lists are refilled from the free lists.
2317 : */
2318 : static inline bool check_pcp_refill(struct page *page, unsigned int order)
2319 : {
2320 : if (debug_pagealloc_enabled_static())
2321 : return check_new_pages(page, order);
2322 : else
2323 : return false;
2324 : }
2325 :
2326 : static inline bool check_new_pcp(struct page *page, unsigned int order)
2327 : {
2328 : return check_new_pages(page, order);
2329 : }
2330 : #else
2331 : /*
2332 : * With DEBUG_VM disabled, free order-0 pages are checked for expected state
2333 : * when pcp lists are being refilled from the free lists. With debug_pagealloc
2334 : * enabled, they are also checked when being allocated from the pcp lists.
2335 : */
2336 : static inline bool check_pcp_refill(struct page *page, unsigned int order)
2337 : {
2338 683 : return check_new_pages(page, order);
2339 : }
2340 : static inline bool check_new_pcp(struct page *page, unsigned int order)
2341 : {
2342 : if (debug_pagealloc_enabled_static())
2343 : return check_new_pages(page, order);
2344 : else
2345 : return false;
2346 : }
2347 : #endif /* CONFIG_DEBUG_VM */
2348 :
2349 : static inline bool should_skip_kasan_unpoison(gfp_t flags, bool init_tags)
2350 : {
2351 : /* Don't skip if a software KASAN mode is enabled. */
2352 : if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
2353 : IS_ENABLED(CONFIG_KASAN_SW_TAGS))
2354 : return false;
2355 :
2356 : /* Skip, if hardware tag-based KASAN is not enabled. */
2357 : if (!kasan_hw_tags_enabled())
2358 : return true;
2359 :
2360 : /*
2361 : * With hardware tag-based KASAN enabled, skip if either:
2362 : *
2363 : * 1. Memory tags have already been cleared via tag_clear_highpage().
2364 : * 2. Skipping has been requested via __GFP_SKIP_KASAN_UNPOISON.
2365 : */
2366 : return init_tags || (flags & __GFP_SKIP_KASAN_UNPOISON);
2367 : }
2368 :
2369 : static inline bool should_skip_init(gfp_t flags)
2370 : {
2371 : /* Don't skip, if hardware tag-based KASAN is not enabled. */
2372 : if (!kasan_hw_tags_enabled())
2373 : return false;
2374 :
2375 : /* For hardware tag-based KASAN, skip if requested. */
2376 : return (flags & __GFP_SKIP_ZERO);
2377 : }
2378 :
2379 528 : inline void post_alloc_hook(struct page *page, unsigned int order,
2380 : gfp_t gfp_flags)
2381 : {
2382 1056 : bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) &&
2383 : !should_skip_init(gfp_flags);
2384 528 : bool init_tags = init && (gfp_flags & __GFP_ZEROTAGS);
2385 :
2386 1056 : set_page_private(page, 0);
2387 528 : set_page_refcounted(page);
2388 :
2389 528 : arch_alloc_page(page, order);
2390 528 : debug_pagealloc_map_pages(page, 1 << order);
2391 :
2392 : /*
2393 : * Page unpoisoning must happen before memory initialization.
2394 : * Otherwise, the poison pattern will be overwritten for __GFP_ZERO
2395 : * allocations and the page unpoisoning code will complain.
2396 : */
2397 528 : kernel_unpoison_pages(page, 1 << order);
2398 :
2399 : /*
2400 : * As memory initialization might be integrated into KASAN,
2401 : * KASAN unpoisoning and memory initialization code must be
2402 : * kept together to avoid discrepancies in behavior.
2403 : */
2404 :
2405 : /*
2406 : * Zero the memory tags if requested (this happens only when the
2407 : * memory should be initialized as well).
2408 : */
2409 528 : if (init_tags) {
2410 : int i;
2411 :
2412 : /* Initialize both memory and tags. */
2413 : for (i = 0; i != 1 << order; ++i)
2414 : tag_clear_highpage(page + i);
2415 :
2416 : /* Note that memory is already initialized by the loop above. */
2417 : init = false;
2418 : }
2419 528 : if (!should_skip_kasan_unpoison(gfp_flags, init_tags)) {
2420 : /* Unpoison shadow memory or set memory tags. */
2421 : kasan_unpoison_pages(page, order, init);
2422 :
2423 : /* Note that memory is already initialized by KASAN. */
2424 : if (kasan_has_integrated_init())
2425 : init = false;
2426 : }
2427 : /* If memory is still not initialized, do it now. */
2428 528 : if (init)
2429 : kernel_init_free_pages(page, 1 << order);
2430 : /* Propagate __GFP_SKIP_KASAN_POISON to page flags. */
2431 : if (kasan_hw_tags_enabled() && (gfp_flags & __GFP_SKIP_KASAN_POISON))
2432 : SetPageSkipKASanPoison(page);
2433 :
2434 528 : set_page_owner(page, order, gfp_flags);
2435 528 : page_table_check_alloc(page, order);
2436 528 : }
2437 :
2438 468 : static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
2439 : unsigned int alloc_flags)
2440 : {
2441 528 : post_alloc_hook(page, order, gfp_flags);
2442 :
2443 468 : if (order && (gfp_flags & __GFP_COMP))
2444 : prep_compound_page(page, order);
2445 :
2446 : /*
2447 : * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
2448 : * allocate the page. The expectation is that the caller is taking
2449 : * steps that will free more memory. The caller should avoid the page
2450 : * being used for !PFMEMALLOC purposes.
2451 : */
2452 468 : if (alloc_flags & ALLOC_NO_WATERMARKS)
2453 0 : set_page_pfmemalloc(page);
2454 : else
2455 528 : clear_page_pfmemalloc(page);
2456 468 : }
2457 :
2458 : /*
2459 : * Go through the free lists for the given migratetype and remove
2460 : * the smallest available page from the freelists
2461 : */
2462 : static __always_inline
2463 : struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
2464 : int migratetype)
2465 : {
2466 : unsigned int current_order;
2467 : struct free_area *area;
2468 : struct page *page;
2469 :
2470 : /* Find a page of the appropriate size in the preferred list */
2471 2826 : for (current_order = order; current_order < MAX_ORDER; ++current_order) {
2472 1411 : area = &(zone->free_area[current_order]);
2473 1411 : page = get_page_from_free_area(area, migratetype);
2474 1411 : if (!page)
2475 720 : continue;
2476 691 : del_page_from_free_list(page, zone, current_order);
2477 1382 : expand(zone, page, order, current_order, migratetype);
2478 691 : set_pcppage_migratetype(page, migratetype);
2479 : return page;
2480 : }
2481 :
2482 : return NULL;
2483 : }
2484 :
2485 :
2486 : /*
2487 : * This array describes the order in which free lists are fallen back to
2488 : * when the free lists for the desired migratetype are depleted.
2489 : *
2490 : * The other migratetypes do not have fallbacks.
2491 : */
2492 : static int fallbacks[MIGRATE_TYPES][3] = {
2493 : [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
2494 : [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
2495 : [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
2496 : };
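/*
 * For example, a MIGRATE_UNMOVABLE allocation whose own free lists are
 * empty will try MIGRATE_RECLAIMABLE first and MIGRATE_MOVABLE second;
 * the MIGRATE_TYPES entry terminates the walk in find_suitable_fallback().
 */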
2497 :
2498 : #ifdef CONFIG_CMA
2499 : static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone,
2500 : unsigned int order)
2501 : {
2502 : return __rmqueue_smallest(zone, order, MIGRATE_CMA);
2503 : }
2504 : #else
2505 : static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
2506 : unsigned int order) { return NULL; }
2507 : #endif
2508 :
2509 : /*
2510 : * Move the free pages in a range to the freelist tail of the requested type.
2511 : * Note that start_page and end_pages are not aligned on a pageblock
2512 : * boundary. If alignment is required, use move_freepages_block()
2513 : */
2514 0 : static int move_freepages(struct zone *zone,
2515 : unsigned long start_pfn, unsigned long end_pfn,
2516 : int migratetype, int *num_movable)
2517 : {
2518 : struct page *page;
2519 : unsigned long pfn;
2520 : unsigned int order;
2521 0 : int pages_moved = 0;
2522 :
2523 0 : for (pfn = start_pfn; pfn <= end_pfn;) {
2524 0 : page = pfn_to_page(pfn);
2525 0 : if (!PageBuddy(page)) {
2526 : /*
2527 : * We assume that pages that could be isolated for
2528 : * migration are movable. But we don't actually try
2529 : * isolating, as that would be expensive.
2530 : */
2531 0 : if (num_movable &&
2532 0 : (PageLRU(page) || __PageMovable(page)))
2533 0 : (*num_movable)++;
2534 0 : pfn++;
2535 0 : continue;
2536 : }
2537 :
2538 : /* Make sure we are not inadvertently changing nodes */
2539 : VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
2540 : VM_BUG_ON_PAGE(page_zone(page) != zone, page);
2541 :
2542 0 : order = buddy_order(page);
2543 0 : move_to_free_list(page, zone, order, migratetype);
2544 0 : pfn += 1 << order;
2545 0 : pages_moved += 1 << order;
2546 : }
2547 :
2548 0 : return pages_moved;
2549 : }
2550 :
2551 0 : int move_freepages_block(struct zone *zone, struct page *page,
2552 : int migratetype, int *num_movable)
2553 : {
2554 : unsigned long start_pfn, end_pfn, pfn;
2555 :
2556 0 : if (num_movable)
2557 0 : *num_movable = 0;
2558 :
2559 0 : pfn = page_to_pfn(page);
2560 0 : start_pfn = pfn & ~(pageblock_nr_pages - 1);
2561 0 : end_pfn = start_pfn + pageblock_nr_pages - 1;
2562 :
2563 : /* Do not cross zone boundaries */
2564 0 : if (!zone_spans_pfn(zone, start_pfn))
2565 0 : start_pfn = pfn;
2566 0 : if (!zone_spans_pfn(zone, end_pfn))
2567 : return 0;
2568 :
2569 0 : return move_freepages(zone, start_pfn, end_pfn, migratetype,
2570 : num_movable);
2571 : }
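/*
 * For example, with pageblock_nr_pages == 512, a page at pfn 1000 makes
 * move_freepages_block() operate on pfns [512, 1023]. If the zone does not
 * span the block's first pfn, the walk starts at the page's own pfn
 * instead; if it does not span the block's last pfn, nothing is moved.
 */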
2572 :
2573 : static void change_pageblock_range(struct page *pageblock_page,
2574 : int start_order, int migratetype)
2575 : {
2576 2 : int nr_pageblocks = 1 << (start_order - pageblock_order);
2577 :
2578 4 : while (nr_pageblocks--) {
2579 2 : set_pageblock_migratetype(pageblock_page, migratetype);
2580 2 : pageblock_page += pageblock_nr_pages;
2581 : }
2582 : }
2583 :
2584 : /*
2585 : * When we are falling back to another migratetype during allocation, try to
2586 : * steal extra free pages from the same pageblocks to satisfy further
2587 : * allocations, instead of polluting multiple pageblocks.
2588 : *
2589 : * If we are stealing a relatively large buddy page, it is likely there will
2590 : * be more free pages in the pageblock, so try to steal them all. For
2591 : * reclaimable and unmovable allocations, we steal regardless of page size,
2592 : * as fragmentation caused by those allocations polluting movable pageblocks
2593 : * is worse than movable allocations stealing from unmovable and reclaimable
2594 : * pageblocks.
2595 : */
2596 : static bool can_steal_fallback(unsigned int order, int start_mt)
2597 : {
2598 : /*
2599 : * This order check is intentional even though the next check uses
2600 : * a more relaxed order. The reason is that we can steal a whole
2601 : * pageblock if this condition is met, whereas the check below is
2602 : * just a heuristic that does not guarantee it and could be
2603 : * changed at any time.
2604 : */
2605 2 : if (order >= pageblock_order)
2606 : return true;
2607 :
2608 0 : if (order >= pageblock_order / 2 ||
2609 0 : start_mt == MIGRATE_RECLAIMABLE ||
2610 0 : start_mt == MIGRATE_UNMOVABLE ||
2611 : page_group_by_mobility_disabled)
2612 : return true;
2613 :
2614 : return false;
2615 : }
2616 :
2617 0 : static inline bool boost_watermark(struct zone *zone)
2618 : {
2619 : unsigned long max_boost;
2620 :
2621 0 : if (!watermark_boost_factor)
2622 : return false;
2623 : /*
2624 : * Don't bother in zones that are unlikely to produce results.
2625 : * On small machines, including kdump capture kernels running
2626 : * in a small area, boosting the watermark can cause an out of
2627 : * memory situation immediately.
2628 : */
2629 0 : if ((pageblock_nr_pages * 4) > zone_managed_pages(zone))
2630 : return false;
2631 :
2632 0 : max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
2633 : watermark_boost_factor, 10000);
2634 :
2635 : /*
2636 : * The high watermark may be uninitialised if fragmentation occurs
2637 : * very early in boot, so do not boost. We do not fall through
2638 : * and boost by pageblock_nr_pages because failing allocations
2639 : * that early means that reclaim is not going to help, and it may
2640 : * even be impossible to reclaim back up to the boosted watermark,
2641 : * resulting in a hang.
2642 : */
2643 0 : if (!max_boost)
2644 : return false;
2645 :
2646 0 : max_boost = max(pageblock_nr_pages, max_boost);
2647 :
2648 0 : zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
2649 : max_boost);
2650 :
2651 0 : return true;
2652 : }
2653 :
2654 : /*
2655 : * This function implements actual steal behaviour. If order is large enough,
2656 : * we can steal whole pageblock. If not, we first move freepages in this
2657 : * pageblock to our migratetype and determine how many already-allocated pages
2658 : * are there in the pageblock with a compatible migratetype. If at least half
2659 : * of pages are free or compatible, we can change migratetype of the pageblock
2660 : * itself, so pages freed in the future will be put on the correct free list.
2661 : */
2662 2 : static void steal_suitable_fallback(struct zone *zone, struct page *page,
2663 : unsigned int alloc_flags, int start_type, bool whole_block)
2664 : {
2665 4 : unsigned int current_order = buddy_order(page);
2666 : int free_pages, movable_pages, alike_pages;
2667 : int old_block_type;
2668 :
2669 4 : old_block_type = get_pageblock_migratetype(page);
2670 :
2671 : /*
2672 : * This can happen due to races and we want to prevent broken
2673 : * highatomic accounting.
2674 : */
2675 2 : if (is_migrate_highatomic(old_block_type))
2676 : goto single_page;
2677 :
2678 : /* Take ownership for orders >= pageblock_order */
2679 2 : if (current_order >= pageblock_order) {
2680 2 : change_pageblock_range(page, current_order, start_type);
2681 : goto single_page;
2682 : }
2683 :
2684 : /*
2685 : * Boost watermarks to increase reclaim pressure to reduce the
2686 : * likelihood of future fallbacks. Wake kswapd now as the node
2687 : * may be balanced overall and kswapd will not wake naturally.
2688 : */
2689 0 : if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD))
2690 0 : set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
2691 :
2692 : /* We are not allowed to try stealing from the whole block */
2693 0 : if (!whole_block)
2694 : goto single_page;
2695 :
2696 0 : free_pages = move_freepages_block(zone, page, start_type,
2697 : &movable_pages);
2698 : /*
2699 : * Determine how many pages are compatible with our allocation.
2700 : * For movable allocation, it's the number of movable pages which
2701 : * we just obtained. For other types it's a bit more tricky.
2702 : */
2703 0 : if (start_type == MIGRATE_MOVABLE) {
2704 0 : alike_pages = movable_pages;
2705 : } else {
2706 : /*
2707 : * If we are falling back a RECLAIMABLE or UNMOVABLE allocation
2708 : * to MOVABLE pageblock, consider all non-movable pages as
2709 : * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or
2710 : * vice versa, be conservative since we can't distinguish the
2711 : * exact migratetype of non-movable pages.
2712 : */
2713 0 : if (old_block_type == MIGRATE_MOVABLE)
2714 0 : alike_pages = pageblock_nr_pages
2715 0 : - (free_pages + movable_pages);
2716 : else
2717 : alike_pages = 0;
2718 : }
2719 :
2720 : /* moving whole block can fail due to zone boundary conditions */
2721 0 : if (!free_pages)
2722 : goto single_page;
2723 :
2724 : /*
2725 : * If a sufficient number of pages in the block are either free or of
2726 : * comparable migratability as our allocation, claim the whole block.
2727 : */
2728 0 : if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
2729 : page_group_by_mobility_disabled)
2730 0 : set_pageblock_migratetype(page, start_type);
2731 :
2732 0 : return;
2733 :
2734 : single_page:
2735 2 : move_to_free_list(page, zone, current_order, start_type);
2736 : }
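/*
 * Illustration of the heuristic above: with pageblock_order == 9 (512
 * pages per block, a typical value), the pageblock's migratetype is only
 * changed when at least 256 of its pages are free or of a compatible
 * migratetype (or page_group_by_mobility_disabled is set); otherwise only
 * the free pages that were already moved change lists.
 */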
2737 :
2738 : /*
2739 : * Check whether there is a suitable fallback freepage with requested order.
2740 : * If only_stealable is true, this function returns fallback_mt only if
2741 : * we can steal other freepages all together. This would help to reduce
2742 : * fragmentation due to mixed migratetype pages in one pageblock.
2743 : */
2744 2 : int find_suitable_fallback(struct free_area *area, unsigned int order,
2745 : int migratetype, bool only_stealable, bool *can_steal)
2746 : {
2747 : int i;
2748 : int fallback_mt;
2749 :
2750 2 : if (area->nr_free == 0)
2751 : return -1;
2752 :
2753 2 : *can_steal = false;
2754 4 : for (i = 0;; i++) {
2755 6 : fallback_mt = fallbacks[migratetype][i];
2756 4 : if (fallback_mt == MIGRATE_TYPES)
2757 : break;
2758 :
2759 4 : if (free_area_empty(area, fallback_mt))
2760 2 : continue;
2761 :
2762 2 : if (can_steal_fallback(order, migratetype))
2763 2 : *can_steal = true;
2764 :
2765 2 : if (!only_stealable)
2766 : return fallback_mt;
2767 :
2768 0 : if (*can_steal)
2769 : return fallback_mt;
2770 : }
2771 :
2772 : return -1;
2773 : }
2774 :
2775 : /*
2776 : * Reserve a pageblock for exclusive use of high-order atomic allocations if
2777 : * there are no empty page blocks that contain a page with a suitable order
2778 : */
2779 0 : static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
2780 : unsigned int alloc_order)
2781 : {
2782 : int mt;
2783 : unsigned long max_managed, flags;
2784 :
2785 : /*
2786 : * Limit the number reserved to 1 pageblock or roughly 1% of a zone.
2787 : * Check is race-prone but harmless.
2788 : */
2789 0 : max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages;
2790 0 : if (zone->nr_reserved_highatomic >= max_managed)
2791 : return;
2792 :
2793 0 : spin_lock_irqsave(&zone->lock, flags);
2794 :
2795 : /* Recheck the nr_reserved_highatomic limit under the lock */
2796 0 : if (zone->nr_reserved_highatomic >= max_managed)
2797 : goto out_unlock;
2798 :
2799 : /* Yoink! */
2800 0 : mt = get_pageblock_migratetype(page);
2801 : /* Only reserve normal pageblocks (i.e., they can merge with others) */
2802 0 : if (migratetype_is_mergeable(mt)) {
2803 0 : zone->nr_reserved_highatomic += pageblock_nr_pages;
2804 0 : set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
2805 0 : move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL);
2806 : }
2807 :
2808 : out_unlock:
2809 0 : spin_unlock_irqrestore(&zone->lock, flags);
2810 : }
2811 :
2812 : /*
2813 : * Used when an allocation is about to fail under memory pressure. This
2814 : * potentially hurts the reliability of high-order allocations when under
2815 : * intense memory pressure but failed atomic allocations should be easier
2816 : * to recover from than an OOM.
2817 : *
2818 : * If @force is true, try to unreserve a pageblock even though highatomic
2819 : * pageblock is exhausted.
2820 : */
2821 0 : static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
2822 : bool force)
2823 : {
2824 0 : struct zonelist *zonelist = ac->zonelist;
2825 : unsigned long flags;
2826 : struct zoneref *z;
2827 : struct zone *zone;
2828 : struct page *page;
2829 : int order;
2830 : bool ret;
2831 :
2832 0 : for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
2833 : ac->nodemask) {
2834 : /*
2835 : * Preserve at least one pageblock unless memory pressure
2836 : * is really high.
2837 : */
2838 0 : if (!force && zone->nr_reserved_highatomic <=
2839 : pageblock_nr_pages)
2840 0 : continue;
2841 :
2842 0 : spin_lock_irqsave(&zone->lock, flags);
2843 0 : for (order = 0; order < MAX_ORDER; order++) {
2844 0 : struct free_area *area = &(zone->free_area[order]);
2845 :
2846 0 : page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
2847 0 : if (!page)
2848 0 : continue;
2849 :
2850 : /*
2851 : * In the page freeing path, the migratetype change is racy,
2852 : * so we can encounter several free pages of one pageblock
2853 : * in this loop although we changed the pageblock type
2854 : * from highatomic to ac->migratetype. So we should
2855 : * adjust the count only once.
2856 : */
2857 0 : if (is_migrate_highatomic_page(page)) {
2858 : /*
2859 : * It should never happen but changes to
2860 : * locking could inadvertently allow a per-cpu
2861 : * drain to add pages to MIGRATE_HIGHATOMIC
2862 : * while unreserving so be safe and watch for
2863 : * underflows.
2864 : */
2865 0 : zone->nr_reserved_highatomic -= min(
2866 : pageblock_nr_pages,
2867 : zone->nr_reserved_highatomic);
2868 : }
2869 :
2870 : /*
2871 : * Convert to ac->migratetype and avoid the normal
2872 : * pageblock stealing heuristics. Minimally, the caller
2873 : * is doing the work and needs the pages. More
2874 : * importantly, if the block was always converted to
2875 : * MIGRATE_UNMOVABLE or another type then the number
2876 : * of pageblocks that cannot be completely freed
2877 : * may increase.
2878 : */
2879 0 : set_pageblock_migratetype(page, ac->migratetype);
2880 0 : ret = move_freepages_block(zone, page, ac->migratetype,
2881 : NULL);
2882 0 : if (ret) {
2883 0 : spin_unlock_irqrestore(&zone->lock, flags);
2884 0 : return ret;
2885 : }
2886 : }
2887 0 : spin_unlock_irqrestore(&zone->lock, flags);
2888 : }
2889 :
2890 : return false;
2891 : }
2892 :
2893 : /*
2894 : * Try finding a free buddy page on the fallback list and put it on the free
2895 : * list of requested migratetype, possibly along with other pages from the same
2896 : * block, depending on fragmentation avoidance heuristics. Returns true if
2897 : * fallback was found so that __rmqueue_smallest() can grab it.
2898 : *
2899 : * The use of signed ints for order and current_order is a deliberate
2900 : * deviation from the rest of this file, to make the for loop
2901 : * condition simpler.
2902 : */
2903 : static __always_inline bool
2904 : __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
2905 : unsigned int alloc_flags)
2906 : {
2907 : struct free_area *area;
2908 : int current_order;
2909 2 : int min_order = order;
2910 : struct page *page;
2911 : int fallback_mt;
2912 : bool can_steal;
2913 :
2914 : /*
2915 : * Do not steal pages from freelists belonging to other pageblocks
2916 : * i.e. orders < pageblock_order. If there are no local zones free,
2917 : * the zonelists will be reiterated without ALLOC_NOFRAGMENT.
2918 : */
2919 : if (alloc_flags & ALLOC_NOFRAGMENT)
2920 : min_order = pageblock_order;
2921 :
2922 : /*
2923 : * Find the largest available free page in the other list. This roughly
2924 : * approximates finding the pageblock with the most free pages, which
2925 : * would be too costly to do exactly.
2926 : */
2927 4 : for (current_order = MAX_ORDER - 1; current_order >= min_order;
2928 0 : --current_order) {
2929 2 : area = &(zone->free_area[current_order]);
2930 2 : fallback_mt = find_suitable_fallback(area, current_order,
2931 : start_migratetype, false, &can_steal);
2932 2 : if (fallback_mt == -1)
2933 0 : continue;
2934 :
2935 : /*
2936 : * We cannot steal all free pages from the pageblock and the
2937 : * requested migratetype is movable. In that case it's better to
2938 : * steal and split the smallest available page instead of the
2939 : * largest available page, because even if the next movable
2940 : * allocation falls back into a different pageblock than this
2941 : * one, it won't cause permanent fragmentation.
2942 : */
2943 2 : if (!can_steal && start_migratetype == MIGRATE_MOVABLE
2944 0 : && current_order > order)
2945 : goto find_smallest;
2946 :
2947 : goto do_steal;
2948 : }
2949 :
2950 : return false;
2951 :
2952 : find_smallest:
2953 0 : for (current_order = order; current_order < MAX_ORDER;
2954 0 : current_order++) {
2955 0 : area = &(zone->free_area[current_order]);
2956 0 : fallback_mt = find_suitable_fallback(area, current_order,
2957 : start_migratetype, false, &can_steal);
2958 0 : if (fallback_mt != -1)
2959 : break;
2960 : }
2961 :
2962 : /*
2963 : * This should not happen - we already found a suitable fallback
2964 : * when looking for the largest page.
2965 : */
2966 : VM_BUG_ON(current_order == MAX_ORDER);
2967 :
2968 : do_steal:
2969 2 : page = get_page_from_free_area(area, fallback_mt);
2970 :
2971 2 : steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
2972 : can_steal);
2973 :
2974 2 : trace_mm_page_alloc_extfrag(page, order, current_order,
2975 : start_migratetype, fallback_mt);
2976 :
2977 : return true;
2978 :
2979 : }
2980 :
2981 : /*
2982 : * Do the hard work of removing an element from the buddy allocator.
2983 : * Call me with the zone->lock already held.
2984 : */
2985 : static __always_inline struct page *
2986 : __rmqueue(struct zone *zone, unsigned int order, int migratetype,
2987 : unsigned int alloc_flags)
2988 : {
2989 : struct page *page;
2990 :
2991 : if (IS_ENABLED(CONFIG_CMA)) {
2992 : /*
2993 : * Balance movable allocations between regular and CMA areas by
2994 : * allocating from CMA when over half of the zone's free memory
2995 : * is in the CMA area.
2996 : */
2997 : if (alloc_flags & ALLOC_CMA &&
2998 : zone_page_state(zone, NR_FREE_CMA_PAGES) >
2999 : zone_page_state(zone, NR_FREE_PAGES) / 2) {
3000 : page = __rmqueue_cma_fallback(zone, order);
3001 : if (page)
3002 : goto out;
3003 : }
3004 : }
3005 : retry:
3006 693 : page = __rmqueue_smallest(zone, order, migratetype);
3007 693 : if (unlikely(!page)) {
3008 2 : if (alloc_flags & ALLOC_CMA)
3009 0 : page = __rmqueue_cma_fallback(zone, order);
3010 :
3011 4 : if (!page && __rmqueue_fallback(zone, order, migratetype,
3012 : alloc_flags))
3013 : goto retry;
3014 : }
3015 : out:
3016 : if (page)
3017 : trace_mm_page_alloc_zone_locked(page, order, migratetype);
3018 : return page;
3019 : }
3020 :
3021 : /*
3022 : * Obtain a specified number of elements from the buddy allocator, all under
3023 : * a single hold of the lock, for efficiency. Add them to the supplied list.
3024 : * Returns the number of new pages which were placed at *list.
3025 : */
3026 27 : static int rmqueue_bulk(struct zone *zone, unsigned int order,
3027 : unsigned long count, struct list_head *list,
3028 : int migratetype, unsigned int alloc_flags)
3029 : {
3030 27 : int i, allocated = 0;
3031 :
3032 : /*
3033 : * local_lock_irq held so equivalent to spin_lock_irqsave for
3034 : * both PREEMPT_RT and non-PREEMPT_RT configurations.
3035 : */
3036 54 : spin_lock(&zone->lock);
3037 710 : for (i = 0; i < count; ++i) {
3038 683 : struct page *page = __rmqueue(zone, order, migratetype,
3039 : alloc_flags);
3040 683 : if (unlikely(page == NULL))
3041 : break;
3042 :
3043 683 : if (unlikely(check_pcp_refill(page, order)))
3044 0 : continue;
3045 :
3046 : /*
3047 : * Split buddy pages returned by expand() are received here in
3048 : * physical page order. The page is added to the tail of the
3049 : * caller's list, so from the caller's perspective the linked
3050 : * list is ordered by page number under some conditions. This is
3051 : * useful for IO devices that process the list forward from the
3052 : * head, thus also in physical page order, and for IO devices
3053 : * that can merge IO requests when the physical pages are
3054 : * ordered properly.
3055 : */
3056 1366 : list_add_tail(&page->lru, list);
3057 683 : allocated++;
3058 : if (is_migrate_cma(get_pcppage_migratetype(page)))
3059 : __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
3060 : -(1 << order));
3061 : }
3062 :
3063 : /*
3064 : * i pages were removed from the buddy list even if some leak due
3065 : * to check_pcp_refill failing so adjust NR_FREE_PAGES based
3066 : * on i. Do not confuse with 'allocated' which is the number of
3067 : * pages added to the pcp list.
3068 : */
3069 54 : __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
3070 54 : spin_unlock(&zone->lock);
3071 27 : return allocated;
3072 : }
3073 :
3074 : #ifdef CONFIG_NUMA
3075 : /*
3076 : * Called from the vmstat counter updater to drain pagesets of this
3077 : * currently executing processor on remote nodes after they have
3078 : * expired.
3079 : *
3080 : * Note that this function must be called with the thread pinned to
3081 : * a single processor.
3082 : */
3083 : void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
3084 : {
3085 : unsigned long flags;
3086 : int to_drain, batch;
3087 :
3088 : local_lock_irqsave(&pagesets.lock, flags);
3089 : batch = READ_ONCE(pcp->batch);
3090 : to_drain = min(pcp->count, batch);
3091 : if (to_drain > 0)
3092 : free_pcppages_bulk(zone, to_drain, pcp, 0);
3093 : local_unlock_irqrestore(&pagesets.lock, flags);
3094 : }
3095 : #endif
3096 :
3097 : /*
3098 : * Drain pcplists of the indicated processor and zone.
3099 : *
3100 : * The processor must either be the current processor and the
3101 : * thread pinned to the current processor or a processor that
3102 : * is not online.
3103 : */
3104 0 : static void drain_pages_zone(unsigned int cpu, struct zone *zone)
3105 : {
3106 : unsigned long flags;
3107 : struct per_cpu_pages *pcp;
3108 :
3109 0 : local_lock_irqsave(&pagesets.lock, flags);
3110 :
3111 0 : pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
3112 0 : if (pcp->count)
3113 0 : free_pcppages_bulk(zone, pcp->count, pcp, 0);
3114 :
3115 0 : local_unlock_irqrestore(&pagesets.lock, flags);
3116 0 : }
3117 :
3118 : /*
3119 : * Drain pcplists of all zones on the indicated processor.
3120 : *
3121 : * The processor must either be the current processor and the
3122 : * thread pinned to the current processor or a processor that
3123 : * is not online.
3124 : */
3125 0 : static void drain_pages(unsigned int cpu)
3126 : {
3127 : struct zone *zone;
3128 :
3129 0 : for_each_populated_zone(zone) {
3130 0 : drain_pages_zone(cpu, zone);
3131 : }
3132 0 : }
3133 :
3134 : /*
3135 : * Spill all of this CPU's per-cpu pages back into the buddy allocator.
3136 : *
3137 : * The CPU has to be pinned. When zone parameter is non-NULL, spill just
3138 : * the single zone's pages.
3139 : */
3140 0 : void drain_local_pages(struct zone *zone)
3141 : {
3142 0 : int cpu = smp_processor_id();
3143 :
3144 0 : if (zone)
3145 0 : drain_pages_zone(cpu, zone);
3146 : else
3147 0 : drain_pages(cpu);
3148 0 : }
3149 :
3150 0 : static void drain_local_pages_wq(struct work_struct *work)
3151 : {
3152 : struct pcpu_drain *drain;
3153 :
3154 0 : drain = container_of(work, struct pcpu_drain, work);
3155 :
3156 : /*
3157 : * drain_all_pages doesn't use proper cpu hotplug protection so
3158 : * we can race with cpu offline when the WQ can move this from
3159 : * a cpu pinned worker to an unbound one. We can operate on a different
3160 : * cpu which is alright but we also have to make sure to not move to
3161 : * a different one.
3162 : */
3163 : migrate_disable();
3164 0 : drain_local_pages(drain->zone);
3165 : migrate_enable();
3166 0 : }
3167 :
3168 : /*
3169 : * The implementation of drain_all_pages(), exposing an extra parameter to
3170 : * drain on all cpus.
3171 : *
3172 : * drain_all_pages() is optimized to only execute on cpus where pcplists are
3173 : * not empty. The check for non-emptiness can however race with a free to
3174 : * pcplist that has not yet increased the pcp->count from 0 to 1. Callers
3175 : * that need the guarantee that every CPU has drained can disable the
3176 : * optimizing racy check.
3177 : */
3178 0 : static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
3179 : {
3180 : int cpu;
3181 :
3182 : /*
3183 : * Allocate in the BSS so we won't require allocation in
3184 : * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
3185 : */
3186 : static cpumask_t cpus_with_pcps;
3187 :
3188 : /*
3189 : * Make sure nobody triggers this path before mm_percpu_wq is fully
3190 : * initialized.
3191 : */
3192 0 : if (WARN_ON_ONCE(!mm_percpu_wq))
3193 : return;
3194 :
3195 : /*
3196 : * Do not drain if one is already in progress unless it's specific to
3197 : * a zone. Such callers are primarily CMA and memory hotplug and need
3198 : * the drain to be complete when the call returns.
3199 : */
3200 0 : if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) {
3201 0 : if (!zone)
3202 : return;
3203 0 : mutex_lock(&pcpu_drain_mutex);
3204 : }
3205 :
3206 : /*
3207 : * We don't care about racing with a CPU hotplug event,
3208 : * as the offline notification will cause the notified
3209 : * cpu to drain that CPU's pcps, and on_each_cpu_mask()
3210 : * disables preemption as part of its processing.
3211 : */
3212 0 : for_each_online_cpu(cpu) {
3213 : struct per_cpu_pages *pcp;
3214 : struct zone *z;
3215 0 : bool has_pcps = false;
3216 :
3217 0 : if (force_all_cpus) {
3218 : /*
3219 : * The pcp.count check is racy, some callers need a
3220 : * guarantee that no cpu is missed.
3221 : */
3222 : has_pcps = true;
3223 0 : } else if (zone) {
3224 0 : pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
3225 0 : if (pcp->count)
3226 0 : has_pcps = true;
3227 : } else {
3228 0 : for_each_populated_zone(z) {
3229 0 : pcp = per_cpu_ptr(z->per_cpu_pageset, cpu);
3230 0 : if (pcp->count) {
3231 : has_pcps = true;
3232 : break;
3233 : }
3234 : }
3235 : }
3236 :
3237 0 : if (has_pcps)
3238 0 : cpumask_set_cpu(cpu, &cpus_with_pcps);
3239 : else
3240 : cpumask_clear_cpu(cpu, &cpus_with_pcps);
3241 : }
3242 :
3243 0 : for_each_cpu(cpu, &cpus_with_pcps) {
3244 0 : struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu);
3245 :
3246 0 : drain->zone = zone;
3247 0 : INIT_WORK(&drain->work, drain_local_pages_wq);
3248 0 : queue_work_on(cpu, mm_percpu_wq, &drain->work);
3249 : }
3250 0 : for_each_cpu(cpu, &cpus_with_pcps)
3251 0 : flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work);
3252 :
3253 0 : mutex_unlock(&pcpu_drain_mutex);
3254 : }
3255 :
3256 : /*
3257 : * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
3258 : *
3259 : * When zone parameter is non-NULL, spill just the single zone's pages.
3260 : *
3261 : * Note that this can be extremely slow as the draining happens in a workqueue.
3262 : */
3263 0 : void drain_all_pages(struct zone *zone)
3264 : {
3265 0 : __drain_all_pages(zone, false);
3266 0 : }
3267 :
3268 : #ifdef CONFIG_HIBERNATION
3269 :
3270 : /*
3271 : * Touch the watchdog for every WD_PAGE_COUNT pages.
3272 : */
3273 : #define WD_PAGE_COUNT (128*1024)
3274 :
3275 : void mark_free_pages(struct zone *zone)
3276 : {
3277 : unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT;
3278 : unsigned long flags;
3279 : unsigned int order, t;
3280 : struct page *page;
3281 :
3282 : if (zone_is_empty(zone))
3283 : return;
3284 :
3285 : spin_lock_irqsave(&zone->lock, flags);
3286 :
3287 : max_zone_pfn = zone_end_pfn(zone);
3288 : for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
3289 : if (pfn_valid(pfn)) {
3290 : page = pfn_to_page(pfn);
3291 :
3292 : if (!--page_count) {
3293 : touch_nmi_watchdog();
3294 : page_count = WD_PAGE_COUNT;
3295 : }
3296 :
3297 : if (page_zone(page) != zone)
3298 : continue;
3299 :
3300 : if (!swsusp_page_is_forbidden(page))
3301 : swsusp_unset_page_free(page);
3302 : }
3303 :
3304 : for_each_migratetype_order(order, t) {
3305 : list_for_each_entry(page,
3306 : &zone->free_area[order].free_list[t], lru) {
3307 : unsigned long i;
3308 :
3309 : pfn = page_to_pfn(page);
3310 : for (i = 0; i < (1UL << order); i++) {
3311 : if (!--page_count) {
3312 : touch_nmi_watchdog();
3313 : page_count = WD_PAGE_COUNT;
3314 : }
3315 : swsusp_set_page_free(pfn_to_page(pfn + i));
3316 : }
3317 : }
3318 : }
3319 : spin_unlock_irqrestore(&zone->lock, flags);
3320 : }
3321 : #endif /* CONFIG_PM */
3322 :
3323 3 : static bool free_unref_page_prepare(struct page *page, unsigned long pfn,
3324 : unsigned int order)
3325 : {
3326 : int migratetype;
3327 :
3328 3 : if (!free_pcp_prepare(page, order))
3329 : return false;
3330 :
3331 3 : migratetype = get_pfnblock_migratetype(page, pfn);
3332 6 : set_pcppage_migratetype(page, migratetype);
3333 3 : return true;
3334 : }
3335 :
3336 : static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch,
3337 : bool free_high)
3338 : {
3339 : int min_nr_free, max_nr_free;
3340 :
3341 : /* Free everything if batch freeing high-order pages. */
3342 0 : if (unlikely(free_high))
3343 : return pcp->count;
3344 :
3345 : /* Check for PCP disabled or boot pageset */
3346 0 : if (unlikely(high < batch))
3347 : return 1;
3348 :
3349 : /* Leave at least pcp->batch pages on the list */
3350 0 : min_nr_free = batch;
3351 0 : max_nr_free = high - batch;
3352 :
3353 : /*
3354 : * Double the number of pages freed each time there is subsequent
3355 : * freeing of pages without any allocation.
3356 : */
3357 0 : batch <<= pcp->free_factor;
3358 0 : if (batch < max_nr_free)
3359 0 : pcp->free_factor++;
3360 0 : batch = clamp(batch, min_nr_free, max_nr_free);
3361 :
3362 : return batch;
3363 : }
3364 :
3365 : static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
3366 : bool free_high)
3367 : {
3368 3 : int high = READ_ONCE(pcp->high);
3369 :
3370 3 : if (unlikely(!high || free_high))
3371 : return 0;
3372 :
3373 6 : if (!test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags))
3374 : return high;
3375 :
3376 : /*
3377 : * If reclaim is active, limit the number of pages that can be
3378 : * stored on pcp lists
3379 : */
3380 0 : return min(READ_ONCE(pcp->batch) << 2, high);
3381 : }
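
/*
 * Worked example with assumed values (not from the source): with
 * pcp->high = 512 and pcp->batch = 63, active reclaim on the zone caps the
 * per-cpu list at min(63 << 2, 512) = 252 pages instead of the full 512.
 */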
3382 :
3383 3 : static void free_unref_page_commit(struct page *page, int migratetype,
3384 : unsigned int order)
3385 : {
3386 3 : struct zone *zone = page_zone(page);
3387 : struct per_cpu_pages *pcp;
3388 : int high;
3389 : int pindex;
3390 : bool free_high;
3391 :
3392 3 : __count_vm_event(PGFREE);
3393 3 : pcp = this_cpu_ptr(zone->per_cpu_pageset);
3394 6 : pindex = order_to_pindex(migratetype, order);
3395 6 : list_add(&page->lru, &pcp->lists[pindex]);
3396 3 : pcp->count += 1 << order;
3397 :
3398 : /*
3399 : * As high-order pages other than THPs stored on PCP can contribute
3400 : * to fragmentation, limit the number stored when PCP is heavily
3401 : * freeing without allocation. The remainder after bulk freeing
3402 : * stops will be drained from vmstat refresh context.
3403 : */
3404 3 : free_high = (pcp->free_factor && order && order <= PAGE_ALLOC_COSTLY_ORDER);
3405 :
3406 6 : high = nr_pcp_high(pcp, zone, free_high);
3407 3 : if (pcp->count >= high) {
3408 0 : int batch = READ_ONCE(pcp->batch);
3409 :
3410 0 : free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch, free_high), pcp, pindex);
3411 : }
3412 3 : }
3413 :
3414 : /*
3415 : * Free a pcp page
3416 : */
3417 3 : void free_unref_page(struct page *page, unsigned int order)
3418 : {
3419 : unsigned long flags;
3420 3 : unsigned long pfn = page_to_pfn(page);
3421 : int migratetype;
3422 :
3423 3 : if (!free_unref_page_prepare(page, pfn, order))
3424 : return;
3425 :
3426 : /*
3427 : * We only track unmovable, reclaimable and movable on pcp lists.
3428 : * Place ISOLATE pages on the isolated list because they are being
3429 : * offlined but treat HIGHATOMIC as movable pages so we can get those
3430 : * areas back if necessary. Otherwise, we may have to free
3431 : * excessively into the page allocator.
3432 : */
3433 6 : migratetype = get_pcppage_migratetype(page);
3434 3 : if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
3435 : if (unlikely(is_migrate_isolate(migratetype))) {
3436 : free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE);
3437 : return;
3438 : }
3439 0 : migratetype = MIGRATE_MOVABLE;
3440 : }
3441 :
3442 3 : local_lock_irqsave(&pagesets.lock, flags);
3443 3 : free_unref_page_commit(page, migratetype, order);
3444 3 : local_unlock_irqrestore(&pagesets.lock, flags);
3445 : }
3446 :
3447 : /*
3448 : * Free a list of 0-order pages
3449 : */
3450 0 : void free_unref_page_list(struct list_head *list)
3451 : {
3452 : struct page *page, *next;
3453 : unsigned long flags;
3454 0 : int batch_count = 0;
3455 : int migratetype;
3456 :
3457 : /* Prepare pages for freeing */
3458 0 : list_for_each_entry_safe(page, next, list, lru) {
3459 0 : unsigned long pfn = page_to_pfn(page);
3460 0 : if (!free_unref_page_prepare(page, pfn, 0)) {
3461 0 : list_del(&page->lru);
3462 0 : continue;
3463 : }
3464 :
3465 : /*
3466 : * Free isolated pages directly to the allocator, see
3467 : * comment in free_unref_page.
3468 : */
3469 : migratetype = get_pcppage_migratetype(page);
3470 : if (unlikely(is_migrate_isolate(migratetype))) {
3471 : list_del(&page->lru);
3472 : free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE);
3473 : continue;
3474 : }
3475 : }
3476 :
3477 0 : local_lock_irqsave(&pagesets.lock, flags);
3478 0 : list_for_each_entry_safe(page, next, list, lru) {
3479 : /*
3480 : * Non-isolated types over MIGRATE_PCPTYPES get added
3481 : * to the MIGRATE_MOVABLE pcp list.
3482 : */
3483 0 : migratetype = get_pcppage_migratetype(page);
3484 0 : if (unlikely(migratetype >= MIGRATE_PCPTYPES))
3485 0 : migratetype = MIGRATE_MOVABLE;
3486 :
3487 0 : trace_mm_page_free_batched(page);
3488 0 : free_unref_page_commit(page, migratetype, 0);
3489 :
3490 : /*
3491 : * Guard against excessive IRQ disabled times when we get
3492 : * a large list of pages to free.
3493 : */
3494 0 : if (++batch_count == SWAP_CLUSTER_MAX) {
3495 0 : local_unlock_irqrestore(&pagesets.lock, flags);
3496 0 : batch_count = 0;
3497 0 : local_lock_irqsave(&pagesets.lock, flags);
3498 : }
3499 : }
3500 0 : local_unlock_irqrestore(&pagesets.lock, flags);
3501 0 : }
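
/*
 * Annotation, not part of the source: SWAP_CLUSTER_MAX is 32, so the loop
 * above re-enables interrupts after every 32 pages committed, bounding the
 * IRQ-off time even when the caller hands in a very long list.
 */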
3502 :
3503 : /*
3504 : * split_page takes a non-compound higher-order page, and splits it into
3505 : * n (1<<order) sub-pages: page[0..n]
3506 : * Each sub-page must be freed individually.
3507 : *
3508 : * Note: this is probably too low level an operation for use in drivers.
3509 : * Please consult with lkml before using this in your driver.
3510 : */
3511 0 : void split_page(struct page *page, unsigned int order)
3512 : {
3513 : int i;
3514 :
3515 : VM_BUG_ON_PAGE(PageCompound(page), page);
3516 : VM_BUG_ON_PAGE(!page_count(page), page);
3517 :
3518 15 : for (i = 1; i < (1 << order); i++)
3519 30 : set_page_refcounted(page + i);
3520 0 : split_page_owner(page, 1 << order);
3521 0 : split_page_memcg(page, 1 << order);
3522 0 : }
3523 : EXPORT_SYMBOL_GPL(split_page);
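
/*
 * Hedged usage sketch, not taken from this file (the function name below is
 * ours): the pattern split_page() enables - allocate an order-2 block, split
 * it into four independent order-0 pages, and free each one on its own. As
 * the comment above says, consult lkml before doing this from a driver.
 */
static void split_page_usage_sketch(void)
{
	struct page *page = alloc_pages(GFP_KERNEL, 2);
	unsigned int i;

	if (!page)
		return;

	split_page(page, 2);		/* page[0..3] now have independent refcounts */

	for (i = 0; i < (1U << 2); i++)
		__free_page(page + i);	/* each sub-page is freed individually */
}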
3524 :
3525 0 : int __isolate_free_page(struct page *page, unsigned int order)
3526 : {
3527 : unsigned long watermark;
3528 : struct zone *zone;
3529 : int mt;
3530 :
3531 0 : BUG_ON(!PageBuddy(page));
3532 :
3533 0 : zone = page_zone(page);
3534 0 : mt = get_pageblock_migratetype(page);
3535 :
3536 0 : if (!is_migrate_isolate(mt)) {
3537 : /*
3538 : * Obey watermarks as if the page was being allocated. We can
3539 : * emulate a high-order watermark check with a raised order-0
3540 : * watermark, because we already know our high-order page
3541 : * exists.
3542 : */
3543 0 : watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
3544 0 : if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
3545 : return 0;
3546 :
3547 0 : __mod_zone_freepage_state(zone, -(1UL << order), mt);
3548 : }
3549 :
3550 : /* Remove page from free list */
3551 :
3552 0 : del_page_from_free_list(page, zone, order);
3553 :
3554 : /*
3555 : * Set the pageblock if the isolated page is at least half of a
3556 : * pageblock
3557 : */
3558 0 : if (order >= pageblock_order - 1) {
3559 0 : struct page *endpage = page + (1 << order) - 1;
3560 0 : for (; page < endpage; page += pageblock_nr_pages) {
3561 0 : int mt = get_pageblock_migratetype(page);
3562 : /*
3563 : * Only change normal pageblocks (i.e., they can merge
3564 : * with others)
3565 : */
3566 0 : if (migratetype_is_mergeable(mt))
3567 0 : set_pageblock_migratetype(page,
3568 : MIGRATE_MOVABLE);
3569 : }
3570 : }
3571 :
3572 :
3573 0 : return 1UL << order;
3574 : }
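
/*
 * Worked example with assumed numbers (not from the source): isolating an
 * order-3 buddy page from a zone whose min watermark is 1000 pages checks a
 * raised order-0 mark of 1000 + (1 << 3) = 1008 free pages; the high-order
 * part of the check is unnecessary because the page being isolated is
 * already known to exist.
 */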
3575 :
3576 : /**
3577 : * __putback_isolated_page - Return a now-isolated page back where we got it
3578 : * @page: Page that was isolated
3579 : * @order: Order of the isolated page
3580 : * @mt: The page's pageblock's migratetype
3581 : *
3582 : * This function is meant to return a page pulled from the free lists via
3583 : * __isolate_free_page back to the free lists they were pulled from.
3584 : */
3585 0 : void __putback_isolated_page(struct page *page, unsigned int order, int mt)
3586 : {
3587 0 : struct zone *zone = page_zone(page);
3588 :
3589 : /* zone lock should be held when this function is called */
3590 : lockdep_assert_held(&zone->lock);
3591 :
3592 : /* Return isolated page to tail of freelist. */
3593 0 : __free_one_page(page, page_to_pfn(page), zone, order, mt,
3594 : FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL);
3595 0 : }
3596 :
3597 : /*
3598 : * Update NUMA hit/miss statistics
3599 : *
3600 : * Must be called with interrupts disabled.
3601 : */
3602 : static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
3603 : long nr_account)
3604 : {
3605 : #ifdef CONFIG_NUMA
3606 : enum numa_stat_item local_stat = NUMA_LOCAL;
3607 :
3608 : /* skip numa counters update if numa stats is disabled */
3609 : if (!static_branch_likely(&vm_numa_stat_key))
3610 : return;
3611 :
3612 : if (zone_to_nid(z) != numa_node_id())
3613 : local_stat = NUMA_OTHER;
3614 :
3615 : if (zone_to_nid(z) == zone_to_nid(preferred_zone))
3616 : __count_numa_events(z, NUMA_HIT, nr_account);
3617 : else {
3618 : __count_numa_events(z, NUMA_MISS, nr_account);
3619 : __count_numa_events(preferred_zone, NUMA_FOREIGN, nr_account);
3620 : }
3621 : __count_numa_events(z, local_stat, nr_account);
3622 : #endif
3623 : }
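
/*
 * Annotation, not part of the source: with the preferred zone on node 0, a
 * page served from node 0 counts as NUMA_HIT (plus NUMA_LOCAL when the
 * allocating CPU is also on node 0); a page served from node 1 counts
 * NUMA_MISS on node 1 and NUMA_FOREIGN on node 0, plus NUMA_OTHER on node 1
 * when the allocating CPU is not local to it.
 */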
3624 :
3625 : /* Remove page from the per-cpu list, caller must protect the list */
3626 : static inline
3627 520 : struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
3628 : int migratetype,
3629 : unsigned int alloc_flags,
3630 : struct per_cpu_pages *pcp,
3631 : struct list_head *list)
3632 : {
3633 : struct page *page;
3634 :
3635 : do {
3636 520 : if (list_empty(list)) {
3637 27 : int batch = READ_ONCE(pcp->batch);
3638 : int alloced;
3639 :
3640 : /*
3641 : * Scale batch relative to order if batch implies
3642 : * free pages can be stored on the PCP. Batch can
3643 : * be 1 for small zones or for boot pagesets which
3644 : * should never store free pages as the pages may
3645 : * belong to arbitrary zones.
3646 : */
3647 27 : if (batch > 1)
3648 16 : batch = max(batch >> order, 2);
3649 27 : alloced = rmqueue_bulk(zone, order,
3650 : batch, list,
3651 : migratetype, alloc_flags);
3652 :
3653 27 : pcp->count += alloced << order;
3654 27 : if (unlikely(list_empty(list)))
3655 : return NULL;
3656 : }
3657 :
3658 520 : page = list_first_entry(list, struct page, lru);
3659 1040 : list_del(&page->lru);
3660 520 : pcp->count -= 1 << order;
3661 520 : } while (check_new_pcp(page, order));
3662 :
3663 520 : return page;
3664 : }
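
/*
 * Worked example with assumed values (not from the source): refilling an
 * empty pcp list for an order-3 request with pcp->batch = 63 scales the
 * batch to max(63 >> 3, 2) = 7, so rmqueue_bulk() pulls seven order-3 blocks
 * from the buddy lists and pcp->count grows by 7 << 3 = 56 pages.
 */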
3665 :
3666 : /* Lock and remove page from the per-cpu list */
3667 460 : static struct page *rmqueue_pcplist(struct zone *preferred_zone,
3668 : struct zone *zone, unsigned int order,
3669 : gfp_t gfp_flags, int migratetype,
3670 : unsigned int alloc_flags)
3671 : {
3672 : struct per_cpu_pages *pcp;
3673 : struct list_head *list;
3674 : struct page *page;
3675 : unsigned long flags;
3676 :
3677 460 : local_lock_irqsave(&pagesets.lock, flags);
3678 :
3679 : /*
3680 : * On allocation, reduce the number of pages that are batch freed.
3681 : * See nr_pcp_free() where free_factor is increased for subsequent
3682 : * frees.
3683 : */
3684 460 : pcp = this_cpu_ptr(zone->per_cpu_pageset);
3685 460 : pcp->free_factor >>= 1;
3686 920 : list = &pcp->lists[order_to_pindex(migratetype, order)];
3687 460 : page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
3688 920 : local_unlock_irqrestore(&pagesets.lock, flags);
3689 460 : if (page) {
3690 920 : __count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
3691 : zone_statistics(preferred_zone, zone, 1);
3692 : }
3693 460 : return page;
3694 : }
3695 :
3696 : /*
3697 : * Allocate a page from the given zone. Use pcplists for orders allowed by pcp_allowed_order().
3698 : */
3699 : static inline
3700 468 : struct page *rmqueue(struct zone *preferred_zone,
3701 : struct zone *zone, unsigned int order,
3702 : gfp_t gfp_flags, unsigned int alloc_flags,
3703 : int migratetype)
3704 : {
3705 : unsigned long flags;
3706 : struct page *page;
3707 :
3708 468 : if (likely(pcp_allowed_order(order))) {
3709 : /*
3710 : * MIGRATE_MOVABLE pcplist could have the pages on CMA area and
3711 : * we need to skip it when CMA area isn't allowed.
3712 : */
3713 : if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA ||
3714 : migratetype != MIGRATE_MOVABLE) {
3715 460 : page = rmqueue_pcplist(preferred_zone, zone, order,
3716 : gfp_flags, migratetype, alloc_flags);
3717 : goto out;
3718 : }
3719 : }
3720 :
3721 : /*
3722 : * We most definitely don't want callers attempting to
3723 : * allocate greater than order-1 page units with __GFP_NOFAIL.
3724 : */
3725 8 : WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
3726 :
3727 : do {
3728 8 : page = NULL;
3729 8 : spin_lock_irqsave(&zone->lock, flags);
3730 : /*
3731 : * order-0 request can reach here when the pcplist is skipped
3732 : * due to non-CMA allocation context. HIGHATOMIC area is
3733 : * reserved for high-order atomic allocation, so order-0
3734 : * request should skip it.
3735 : */
3736 8 : if (order > 0 && alloc_flags & ALLOC_HARDER) {
3737 : page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
3738 : if (page)
3739 : trace_mm_page_alloc_zone_locked(page, order, migratetype);
3740 : }
3741 8 : if (!page) {
3742 8 : page = __rmqueue(zone, order, migratetype, alloc_flags);
3743 8 : if (!page)
3744 : goto failed;
3745 : }
3746 16 : __mod_zone_freepage_state(zone, -(1 << order),
3747 : get_pcppage_migratetype(page));
3748 8 : spin_unlock_irqrestore(&zone->lock, flags);
3749 8 : } while (check_new_pages(page, order));
3750 :
3751 16 : __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
3752 : zone_statistics(preferred_zone, zone, 1);
3753 :
3754 : out:
3755 : /* Separate test+clear to avoid unnecessary atomics */
3756 936 : if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
3757 0 : clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
3758 0 : wakeup_kswapd(zone, 0, 0, zone_idx(zone));
3759 : }
3760 :
3761 : VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
3762 : return page;
3763 :
3764 : failed:
3765 0 : spin_unlock_irqrestore(&zone->lock, flags);
3766 : return NULL;
3767 : }
3768 :
3769 : #ifdef CONFIG_FAIL_PAGE_ALLOC
3770 :
3771 : static struct {
3772 : struct fault_attr attr;
3773 :
3774 : bool ignore_gfp_highmem;
3775 : bool ignore_gfp_reclaim;
3776 : u32 min_order;
3777 : } fail_page_alloc = {
3778 : .attr = FAULT_ATTR_INITIALIZER,
3779 : .ignore_gfp_reclaim = true,
3780 : .ignore_gfp_highmem = true,
3781 : .min_order = 1,
3782 : };
3783 :
3784 : static int __init setup_fail_page_alloc(char *str)
3785 : {
3786 : return setup_fault_attr(&fail_page_alloc.attr, str);
3787 : }
3788 : __setup("fail_page_alloc=", setup_fail_page_alloc);
3789 :
3790 : static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
3791 : {
3792 : if (order < fail_page_alloc.min_order)
3793 : return false;
3794 : if (gfp_mask & __GFP_NOFAIL)
3795 : return false;
3796 : if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
3797 : return false;
3798 : if (fail_page_alloc.ignore_gfp_reclaim &&
3799 : (gfp_mask & __GFP_DIRECT_RECLAIM))
3800 : return false;
3801 :
3802 : return should_fail(&fail_page_alloc.attr, 1 << order);
3803 : }
3804 :
3805 : #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
3806 :
3807 : static int __init fail_page_alloc_debugfs(void)
3808 : {
3809 : umode_t mode = S_IFREG | 0600;
3810 : struct dentry *dir;
3811 :
3812 : dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
3813 : &fail_page_alloc.attr);
3814 :
3815 : debugfs_create_bool("ignore-gfp-wait", mode, dir,
3816 : &fail_page_alloc.ignore_gfp_reclaim);
3817 : debugfs_create_bool("ignore-gfp-highmem", mode, dir,
3818 : &fail_page_alloc.ignore_gfp_highmem);
3819 : debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order);
3820 :
3821 : return 0;
3822 : }
3823 :
3824 : late_initcall(fail_page_alloc_debugfs);
3825 :
3826 : #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
3827 :
3828 : #else /* CONFIG_FAIL_PAGE_ALLOC */
3829 :
3830 : static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
3831 : {
3832 : return false;
3833 : }
3834 :
3835 : #endif /* CONFIG_FAIL_PAGE_ALLOC */
3836 :
3837 483 : noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
3838 : {
3839 483 : return __should_fail_alloc_page(gfp_mask, order);
3840 : }
3841 : ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
3842 :
3843 : static inline long __zone_watermark_unusable_free(struct zone *z,
3844 : unsigned int order, unsigned int alloc_flags)
3845 : {
3846 484 : const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
3847 484 : long unusable_free = (1 << order) - 1;
3848 :
3849 : /*
3850 : * If the caller does not have rights to ALLOC_HARDER then subtract
3851 : * the high-atomic reserves. This will over-estimate the size of the
3852 : * atomic reserve but it avoids a search.
3853 : */
3854 484 : if (likely(!alloc_harder))
3855 484 : unusable_free += z->nr_reserved_highatomic;
3856 :
3857 : #ifdef CONFIG_CMA
3858 : /* If allocation can't use CMA areas don't use free CMA pages */
3859 : if (!(alloc_flags & ALLOC_CMA))
3860 : unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
3861 : #endif
3862 :
3863 : return unusable_free;
3864 : }
3865 :
3866 : /*
3867 : * Return true if free base pages are above 'mark'. For high-order checks it
3868 : * will return true if the order-0 watermark is reached and there is at least
3869 : * one free page of a suitable size. Checking now avoids taking the zone lock
3870 : * to check in the allocation paths if no pages are free.
3871 : */
3872 113 : bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
3873 : int highest_zoneidx, unsigned int alloc_flags,
3874 : long free_pages)
3875 : {
3876 113 : long min = mark;
3877 : int o;
3878 113 : const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
3879 :
3880 : /* free_pages may go negative - that's OK */
3881 226 : free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags);
3882 :
3883 113 : if (alloc_flags & ALLOC_HIGH)
3884 0 : min -= min / 2;
3885 :
3886 113 : if (unlikely(alloc_harder)) {
3887 : /*
3888 : * OOM victims can try even harder than normal ALLOC_HARDER
3889 : * users on the grounds that it's definitely going to be in
3890 : * the exit path shortly and free memory. Any allocation it
3891 : * makes during the free path will be small and short-lived.
3892 : */
3893 0 : if (alloc_flags & ALLOC_OOM)
3894 0 : min -= min / 2;
3895 : else
3896 0 : min -= min / 4;
3897 : }
3898 :
3899 : /*
3900 : * Check watermarks for an order-0 allocation request. If these
3901 : * are not met, then a high-order request also cannot go ahead
3902 : * even if a suitable page happened to be free.
3903 : */
3904 113 : if (free_pages <= min + z->lowmem_reserve[highest_zoneidx])
3905 : return false;
3906 :
3907 : /* If this is an order-0 request then the watermark is fine */
3908 113 : if (!order)
3909 : return true;
3910 :
3911 : /* For a high-order request, check at least one suitable page is free */
3912 120 : for (o = order; o < MAX_ORDER; o++) {
3913 120 : struct free_area *area = &z->free_area[o];
3914 : int mt;
3915 :
3916 120 : if (!area->nr_free)
3917 8 : continue;
3918 :
3919 35 : for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
3920 147 : if (!free_area_empty(area, mt))
3921 : return true;
3922 : }
3923 :
3924 : #ifdef CONFIG_CMA
3925 : if ((alloc_flags & ALLOC_CMA) &&
3926 : !free_area_empty(area, MIGRATE_CMA)) {
3927 : return true;
3928 : }
3929 : #endif
3930 0 : if (alloc_harder && !free_area_empty(area, MIGRATE_HIGHATOMIC))
3931 : return true;
3932 : }
3933 : return false;
3934 : }
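
/*
 * Worked example with assumed numbers (not from the source): an order-0
 * request with mark = 4000, lowmem_reserve = 800, NR_FREE_PAGES = 6000 and
 * 1024 pages in the highatomic reserve has 6000 - ((1 << 0) - 1) - 1024 =
 * 4976 usable pages, which is above 4000 + 800 = 4800, so the check passes.
 * A high-order request must additionally find one free page of at least the
 * requested order on an eligible migratetype list.
 */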
3935 :
3936 0 : bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
3937 : int highest_zoneidx, unsigned int alloc_flags)
3938 : {
3939 0 : return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
3940 0 : zone_page_state(z, NR_FREE_PAGES));
3941 : }
3942 :
3943 483 : static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
3944 : unsigned long mark, int highest_zoneidx,
3945 : unsigned int alloc_flags, gfp_t gfp_mask)
3946 : {
3947 : long free_pages;
3948 :
3949 483 : free_pages = zone_page_state(z, NR_FREE_PAGES);
3950 :
3951 : /*
3952 : * Fast check for order-0 only. If this fails then the reserves
3953 : * need to be calculated.
3954 : */
3955 483 : if (!order) {
3956 : long fast_free;
3957 :
3958 371 : fast_free = free_pages;
3959 742 : fast_free -= __zone_watermark_unusable_free(z, 0, alloc_flags);
3960 371 : if (fast_free > mark + z->lowmem_reserve[highest_zoneidx])
3961 : return true;
3962 : }
3963 :
3964 112 : if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
3965 : free_pages))
3966 : return true;
3967 : /*
3968 : * Ignore watermark boosting for GFP_ATOMIC order-0 allocations
3969 : * when checking the min watermark. The min watermark is the
3970 : * point where boosting is ignored so that kswapd is woken up
3971 : * when below the low watermark.
3972 : */
3973 0 : if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost
3974 : && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) {
3975 0 : mark = z->_watermark[WMARK_MIN];
3976 0 : return __zone_watermark_ok(z, order, mark, highest_zoneidx,
3977 : alloc_flags, free_pages);
3978 : }
3979 :
3980 : return false;
3981 : }
3982 :
3983 1 : bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
3984 : unsigned long mark, int highest_zoneidx)
3985 : {
3986 1 : long free_pages = zone_page_state(z, NR_FREE_PAGES);
3987 :
3988 1 : if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
3989 0 : free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
3990 :
3991 1 : return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0,
3992 : free_pages);
3993 : }
3994 :
3995 : #ifdef CONFIG_NUMA
3996 : int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
3997 :
3998 : static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
3999 : {
4000 : return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
4001 : node_reclaim_distance;
4002 : }
4003 : #else /* CONFIG_NUMA */
4004 : static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
4005 : {
4006 : return true;
4007 : }
4008 : #endif /* CONFIG_NUMA */
4009 :
4010 : /*
4011 : * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid
4012 : * fragmentation is subtle. If the preferred zone was HIGHMEM then
4013 : * premature use of a lower zone may cause lowmem pressure problems that
4014 : * are worse than fragmentation. If the next zone is ZONE_DMA then it is
4015 : * probably too small. It only makes sense to spread allocations to avoid
4016 : * fragmentation between the Normal and DMA32 zones.
4017 : */
4018 : static inline unsigned int
4019 : alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
4020 : {
4021 : unsigned int alloc_flags;
4022 :
4023 : /*
4024 : * __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
4025 : * to save a branch.
4026 : */
4027 468 : alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM);
4028 :
4029 : #ifdef CONFIG_ZONE_DMA32
4030 : if (!zone)
4031 : return alloc_flags;
4032 :
4033 : if (zone_idx(zone) != ZONE_NORMAL)
4034 : return alloc_flags;
4035 :
4036 : /*
4037 : * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
4038 : * the pointer is within zone->zone_pgdat->node_zones[]. Also assume
4039 : * on UMA that if Normal is populated then so is DMA32.
4040 : */
4041 : BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1);
4042 : if (nr_online_nodes > 1 && !populated_zone(--zone))
4043 : return alloc_flags;
4044 :
4045 : alloc_flags |= ALLOC_NOFRAGMENT;
4046 : #endif /* CONFIG_ZONE_DMA32 */
4047 : return alloc_flags;
4048 : }
4049 :
4050 : /* Must be called after current_gfp_context() which can change gfp_mask */
4051 : static inline unsigned int gfp_to_alloc_flags_cma(gfp_t gfp_mask,
4052 : unsigned int alloc_flags)
4053 : {
4054 : #ifdef CONFIG_CMA
4055 : if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
4056 : alloc_flags |= ALLOC_CMA;
4057 : #endif
4058 : return alloc_flags;
4059 : }
4060 :
4061 : /*
4062 : * get_page_from_freelist goes through the zonelist trying to allocate
4063 : * a page.
4064 : */
4065 : static struct page *
4066 468 : get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
4067 : const struct alloc_context *ac)
4068 : {
4069 : struct zoneref *z;
4070 : struct zone *zone;
4071 468 : struct pglist_data *last_pgdat_dirty_limit = NULL;
4072 : bool no_fallback;
4073 :
4074 : retry:
4075 : /*
4076 : * Scan zonelist, looking for a zone with enough free.
4077 : * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
4078 : */
4079 468 : no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
4080 468 : z = ac->preferred_zoneref;
4081 468 : for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx,
4082 : ac->nodemask) {
4083 : struct page *page;
4084 : unsigned long mark;
4085 :
4086 : if (cpusets_enabled() &&
4087 : (alloc_flags & ALLOC_CPUSET) &&
4088 : !__cpuset_zone_allowed(zone, gfp_mask))
4089 : continue;
4090 : /*
4091 : * When allocating a page cache page for writing, we
4092 : * want to get it from a node that is within its dirty
4093 : * limit, such that no single node holds more than its
4094 : * proportional share of globally allowed dirty pages.
4095 : * The dirty limits take into account the node's
4096 : * lowmem reserves and high watermark so that kswapd
4097 : * should be able to balance it without having to
4098 : * write pages from its LRU list.
4099 : *
4100 : * XXX: For now, allow allocations to potentially
4101 : * exceed the per-node dirty limit in the slowpath
4102 : * (spread_dirty_pages unset) before going into reclaim,
4103 : * which is important when on a NUMA setup the allowed
4104 : * nodes are together not big enough to reach the
4105 : * global limit. The proper fix for these situations
4106 : * will require awareness of nodes in the
4107 : * dirty-throttling and the flusher threads.
4108 : */
4109 468 : if (ac->spread_dirty_pages) {
4110 0 : if (last_pgdat_dirty_limit == zone->zone_pgdat)
4111 0 : continue;
4112 :
4113 0 : if (!node_dirty_ok(zone->zone_pgdat)) {
4114 0 : last_pgdat_dirty_limit = zone->zone_pgdat;
4115 0 : continue;
4116 : }
4117 : }
4118 :
4119 : if (no_fallback && nr_online_nodes > 1 &&
4120 : zone != ac->preferred_zoneref->zone) {
4121 : int local_nid;
4122 :
4123 : /*
4124 : * If moving to a remote node, retry but allow
4125 : * fragmenting fallbacks. Locality is more important
4126 : * than fragmentation avoidance.
4127 : */
4128 : local_nid = zone_to_nid(ac->preferred_zoneref->zone);
4129 : if (zone_to_nid(zone) != local_nid) {
4130 : alloc_flags &= ~ALLOC_NOFRAGMENT;
4131 : goto retry;
4132 : }
4133 : }
4134 :
4135 468 : mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
4136 936 : if (!zone_watermark_fast(zone, order, mark,
4137 468 : ac->highest_zoneidx, alloc_flags,
4138 : gfp_mask)) {
4139 : int ret;
4140 :
4141 : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
4142 : /*
4143 : * Watermark failed for this zone, but see if we can
4144 : * grow this zone if it contains deferred pages.
4145 : */
4146 : if (static_branch_unlikely(&deferred_pages)) {
4147 : if (_deferred_grow_zone(zone, order))
4148 : goto try_this_zone;
4149 : }
4150 : #endif
4151 : /* Checked here to keep the fast path fast */
4152 : BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
4153 0 : if (alloc_flags & ALLOC_NO_WATERMARKS)
4154 : goto try_this_zone;
4155 :
4156 : if (!node_reclaim_enabled() ||
4157 : !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
4158 0 : continue;
4159 :
4160 : ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
4161 : switch (ret) {
4162 : case NODE_RECLAIM_NOSCAN:
4163 : /* did not scan */
4164 : continue;
4165 : case NODE_RECLAIM_FULL:
4166 : /* scanned but unreclaimable */
4167 : continue;
4168 : default:
4169 : /* did we reclaim enough */
4170 : if (zone_watermark_ok(zone, order, mark,
4171 : ac->highest_zoneidx, alloc_flags))
4172 : goto try_this_zone;
4173 :
4174 : continue;
4175 : }
4176 : }
4177 :
4178 : try_this_zone:
4179 468 : page = rmqueue(ac->preferred_zoneref->zone, zone, order,
4180 : gfp_mask, alloc_flags, ac->migratetype);
4181 468 : if (page) {
4182 468 : prep_new_page(page, order, gfp_mask, alloc_flags);
4183 :
4184 : /*
4185 : * If this is a high-order atomic allocation then check
4186 : * if the pageblock should be reserved for the future
4187 : */
4188 468 : if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
4189 0 : reserve_highatomic_pageblock(page, zone, order);
4190 :
4191 : return page;
4192 : } else {
4193 : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
4194 : /* Try again if zone has deferred pages */
4195 : if (static_branch_unlikely(&deferred_pages)) {
4196 : if (_deferred_grow_zone(zone, order))
4197 : goto try_this_zone;
4198 : }
4199 : #endif
4200 : }
4201 : }
4202 :
4203 : /*
4204 : * It's possible on a UMA machine to get through all zones that are
4205 : * fragmented. If avoiding fragmentation, reset and try again.
4206 : */
4207 : if (no_fallback) {
4208 : alloc_flags &= ~ALLOC_NOFRAGMENT;
4209 : goto retry;
4210 : }
4211 :
4212 : return NULL;
4213 : }
4214 :
4215 0 : static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
4216 : {
4217 0 : unsigned int filter = SHOW_MEM_FILTER_NODES;
4218 :
4219 : /*
4220 : * This documents exceptions given to allocations in certain
4221 : * contexts that are allowed to allocate outside current's set
4222 : * of allowed nodes.
4223 : */
4224 0 : if (!(gfp_mask & __GFP_NOMEMALLOC))
4225 0 : if (tsk_is_oom_victim(current) ||
4226 0 : (current->flags & (PF_MEMALLOC | PF_EXITING)))
4227 : filter &= ~SHOW_MEM_FILTER_NODES;
4228 0 : if (!in_task() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
4229 0 : filter &= ~SHOW_MEM_FILTER_NODES;
4230 :
4231 0 : show_mem(filter, nodemask);
4232 0 : }
4233 :
4234 0 : void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
4235 : {
4236 : struct va_format vaf;
4237 : va_list args;
4238 : static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1);
4239 :
4240 0 : if ((gfp_mask & __GFP_NOWARN) ||
4241 0 : !__ratelimit(&nopage_rs) ||
4242 0 : ((gfp_mask & __GFP_DMA) && !has_managed_dma()))
4243 0 : return;
4244 :
4245 0 : va_start(args, fmt);
4246 0 : vaf.fmt = fmt;
4247 0 : vaf.va = &args;
4248 0 : pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl",
4249 : current->comm, &vaf, gfp_mask, &gfp_mask,
4250 : nodemask_pr_args(nodemask));
4251 0 : va_end(args);
4252 :
4253 : cpuset_print_current_mems_allowed();
4254 0 : pr_cont("\n");
4255 0 : dump_stack();
4256 0 : warn_alloc_show_mem(gfp_mask, nodemask);
4257 : }
4258 :
4259 : static inline struct page *
4260 0 : __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order,
4261 : unsigned int alloc_flags,
4262 : const struct alloc_context *ac)
4263 : {
4264 : struct page *page;
4265 :
4266 0 : page = get_page_from_freelist(gfp_mask, order,
4267 0 : alloc_flags|ALLOC_CPUSET, ac);
4268 : /*
4269 : * fallback to ignore cpuset restriction if our nodes
4270 : * are depleted
4271 : */
4272 0 : if (!page)
4273 0 : page = get_page_from_freelist(gfp_mask, order,
4274 : alloc_flags, ac);
4275 :
4276 0 : return page;
4277 : }
4278 :
4279 : static inline struct page *
4280 0 : __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
4281 : const struct alloc_context *ac, unsigned long *did_some_progress)
4282 : {
4283 0 : struct oom_control oc = {
4284 0 : .zonelist = ac->zonelist,
4285 0 : .nodemask = ac->nodemask,
4286 : .memcg = NULL,
4287 : .gfp_mask = gfp_mask,
4288 : .order = order,
4289 : };
4290 : struct page *page;
4291 :
4292 0 : *did_some_progress = 0;
4293 :
4294 : /*
4295 : * Acquire the oom lock. If that fails, somebody else is
4296 : * making progress for us.
4297 : */
4298 0 : if (!mutex_trylock(&oom_lock)) {
4299 0 : *did_some_progress = 1;
4300 0 : schedule_timeout_uninterruptible(1);
4301 0 : return NULL;
4302 : }
4303 :
4304 : /*
4305 : * Go through the zonelist yet one more time, keep very high watermark
4306 : * here, this is only to catch a parallel oom killing, we must fail if
4307 : * we're still under heavy pressure. But make sure that this reclaim
4308 : * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
4309 : * allocation which will never fail due to oom_lock already held.
4310 : */
4311 0 : page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
4312 : ~__GFP_DIRECT_RECLAIM, order,
4313 : ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
4314 0 : if (page)
4315 : goto out;
4316 :
4317 : /* Coredumps can quickly deplete all memory reserves */
4318 0 : if (current->flags & PF_DUMPCORE)
4319 : goto out;
4320 : /* The OOM killer will not help higher order allocs */
4321 0 : if (order > PAGE_ALLOC_COSTLY_ORDER)
4322 : goto out;
4323 : /*
4324 : * We have already exhausted all our reclaim opportunities without any
4325 : * success so it is time to admit defeat. We will skip the OOM killer
4326 : * because it is very likely that the caller has a more reasonable
4327 : * fallback than shooting a random task.
4328 : *
4329 : * The OOM killer may not free memory on a specific node.
4330 : */
4331 0 : if (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_THISNODE))
4332 : goto out;
4333 : /* The OOM killer does not needlessly kill tasks for lowmem */
4334 : if (ac->highest_zoneidx < ZONE_NORMAL)
4335 : goto out;
4336 0 : if (pm_suspended_storage())
4337 : goto out;
4338 : /*
4339 : * XXX: GFP_NOFS allocations should rather fail than rely on
4340 : * other request to make a forward progress.
4341 : * We are in an unfortunate situation where out_of_memory cannot
4342 : * do much for this context but let's try it to at least get
4343 : * access to memory reserved if the current task is killed (see
4344 : * out_of_memory). Once filesystems are ready to handle allocation
4345 : * failures more gracefully we should just bail out here.
4346 : */
4347 :
4348 : /* Exhausted what can be done so it's blame time */
4349 0 : if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
4350 0 : *did_some_progress = 1;
4351 :
4352 : /*
4353 : * Help non-failing allocations by giving them access to memory
4354 : * reserves
4355 : */
4356 0 : if (gfp_mask & __GFP_NOFAIL)
4357 0 : page = __alloc_pages_cpuset_fallback(gfp_mask, order,
4358 : ALLOC_NO_WATERMARKS, ac);
4359 : }
4360 : out:
4361 0 : mutex_unlock(&oom_lock);
4362 0 : return page;
4363 : }
4364 :
4365 : /*
4366 : * Maximum number of compaction retries with progress before the OOM
4367 : * killer is considered the only way to move forward.
4368 : */
4369 : #define MAX_COMPACT_RETRIES 16
4370 :
4371 : #ifdef CONFIG_COMPACTION
4372 : /* Try memory compaction for high-order allocations before reclaim */
4373 : static struct page *
4374 0 : __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
4375 : unsigned int alloc_flags, const struct alloc_context *ac,
4376 : enum compact_priority prio, enum compact_result *compact_result)
4377 : {
4378 0 : struct page *page = NULL;
4379 : unsigned long pflags;
4380 : unsigned int noreclaim_flag;
4381 :
4382 0 : if (!order)
4383 : return NULL;
4384 :
4385 0 : psi_memstall_enter(&pflags);
4386 : delayacct_compact_start();
4387 0 : noreclaim_flag = memalloc_noreclaim_save();
4388 :
4389 0 : *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
4390 : prio, &page);
4391 :
4392 0 : memalloc_noreclaim_restore(noreclaim_flag);
4393 0 : psi_memstall_leave(&pflags);
4394 : delayacct_compact_end();
4395 :
4396 0 : if (*compact_result == COMPACT_SKIPPED)
4397 : return NULL;
4398 : /*
4399 : * At least in one zone compaction wasn't deferred or skipped, so let's
4400 : * count a compaction stall
4401 : */
4402 0 : count_vm_event(COMPACTSTALL);
4403 :
4404 : /* Prep a captured page if available */
4405 0 : if (page)
4406 0 : prep_new_page(page, order, gfp_mask, alloc_flags);
4407 :
4408 : /* Try get a page from the freelist if available */
4409 0 : if (!page)
4410 0 : page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
4411 :
4412 0 : if (page) {
4413 0 : struct zone *zone = page_zone(page);
4414 :
4415 0 : zone->compact_blockskip_flush = false;
4416 0 : compaction_defer_reset(zone, order, true);
4417 0 : count_vm_event(COMPACTSUCCESS);
4418 0 : return page;
4419 : }
4420 :
4421 : /*
4422 : * It's bad if a compaction run occurs and fails. The most likely reason
4423 : * is that pages exist, but not enough to satisfy watermarks.
4424 : */
4425 0 : count_vm_event(COMPACTFAIL);
4426 :
4427 0 : cond_resched();
4428 :
4429 0 : return NULL;
4430 : }
4431 :
4432 : static inline bool
4433 0 : should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
4434 : enum compact_result compact_result,
4435 : enum compact_priority *compact_priority,
4436 : int *compaction_retries)
4437 : {
4438 0 : int max_retries = MAX_COMPACT_RETRIES;
4439 : int min_priority;
4440 0 : bool ret = false;
4441 0 : int retries = *compaction_retries;
4442 0 : enum compact_priority priority = *compact_priority;
4443 :
4444 0 : if (!order)
4445 : return false;
4446 :
4447 0 : if (fatal_signal_pending(current))
4448 : return false;
4449 :
4450 0 : if (compaction_made_progress(compact_result))
4451 0 : (*compaction_retries)++;
4452 :
4453 : /*
4454 : * compaction considers all the zones as desperately out of memory
4455 : * so it doesn't really make much sense to retry except when the
4456 : * failure could be caused by insufficient priority
4457 : */
4458 0 : if (compaction_failed(compact_result))
4459 : goto check_priority;
4460 :
4461 : /*
4462 : * compaction was skipped because there are not enough order-0 pages
4463 : * to work with, so we retry only if it looks like reclaim can help.
4464 : */
4465 0 : if (compaction_needs_reclaim(compact_result)) {
4466 0 : ret = compaction_zonelist_suitable(ac, order, alloc_flags);
4467 0 : goto out;
4468 : }
4469 :
4470 : /*
4471 : * make sure the compaction wasn't deferred or didn't bail out early
4472 : * due to lock contention before we declare that we should give up.
4473 : * But the next retry should use a higher priority if allowed, so
4474 : * we don't just keep bailing out endlessly.
4475 : */
4476 0 : if (compaction_withdrawn(compact_result)) {
4477 : goto check_priority;
4478 : }
4479 :
4480 : /*
4481 : * !costly requests are much more important than __GFP_RETRY_MAYFAIL
4482 : * costly ones because they are de facto nofail and invoke the OOM
4483 : * killer to move on while costly can fail and users are ready
4484 : * to cope with that. 1/4 retries is rather arbitrary but we
4485 : * would need much more detailed feedback from compaction to
4486 : * make a better decision.
4487 : */
4488 0 : if (order > PAGE_ALLOC_COSTLY_ORDER)
4489 0 : max_retries /= 4;
4490 0 : if (*compaction_retries <= max_retries) {
4491 : ret = true;
4492 : goto out;
4493 : }
4494 :
4495 : /*
4496 : * Make sure there are attempts at the highest priority if we exhausted
4497 : * all retries or failed at the lower priorities.
4498 : */
4499 : check_priority:
4500 0 : min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
4501 0 : MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
4502 :
4503 0 : if (*compact_priority > min_priority) {
4504 0 : (*compact_priority)--;
4505 0 : *compaction_retries = 0;
4506 0 : ret = true;
4507 : }
4508 : out:
4509 0 : trace_compact_retry(order, priority, compact_result, retries, max_retries, ret);
4510 0 : return ret;
4511 : }
4512 : #else
4513 : static inline struct page *
4514 : __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
4515 : unsigned int alloc_flags, const struct alloc_context *ac,
4516 : enum compact_priority prio, enum compact_result *compact_result)
4517 : {
4518 : *compact_result = COMPACT_SKIPPED;
4519 : return NULL;
4520 : }
4521 :
4522 : static inline bool
4523 : should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
4524 : enum compact_result compact_result,
4525 : enum compact_priority *compact_priority,
4526 : int *compaction_retries)
4527 : {
4528 : struct zone *zone;
4529 : struct zoneref *z;
4530 :
4531 : if (!order || order > PAGE_ALLOC_COSTLY_ORDER)
4532 : return false;
4533 :
4534 : /*
4535 : * There are setups with compaction disabled which would prefer to loop
4536 : * inside the allocator rather than hit the oom killer prematurely.
4537 : * Let's give them a good hope and keep retrying while the order-0
4538 : * watermarks are OK.
4539 : */
4540 : for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
4541 : ac->highest_zoneidx, ac->nodemask) {
4542 : if (zone_watermark_ok(zone, 0, min_wmark_pages(zone),
4543 : ac->highest_zoneidx, alloc_flags))
4544 : return true;
4545 : }
4546 : return false;
4547 : }
4548 : #endif /* CONFIG_COMPACTION */
4549 :
4550 : #ifdef CONFIG_LOCKDEP
4551 : static struct lockdep_map __fs_reclaim_map =
4552 : STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
4553 :
4554 : static bool __need_reclaim(gfp_t gfp_mask)
4555 : {
4556 : /* no reclaim without waiting on it */
4557 : if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
4558 : return false;
4559 :
4560 : /* this guy won't enter reclaim */
4561 : if (current->flags & PF_MEMALLOC)
4562 : return false;
4563 :
4564 : if (gfp_mask & __GFP_NOLOCKDEP)
4565 : return false;
4566 :
4567 : return true;
4568 : }
4569 :
4570 : void __fs_reclaim_acquire(unsigned long ip)
4571 : {
4572 : lock_acquire_exclusive(&__fs_reclaim_map, 0, 0, NULL, ip);
4573 : }
4574 :
4575 : void __fs_reclaim_release(unsigned long ip)
4576 : {
4577 : lock_release(&__fs_reclaim_map, ip);
4578 : }
4579 :
4580 : void fs_reclaim_acquire(gfp_t gfp_mask)
4581 : {
4582 : gfp_mask = current_gfp_context(gfp_mask);
4583 :
4584 : if (__need_reclaim(gfp_mask)) {
4585 : if (gfp_mask & __GFP_FS)
4586 : __fs_reclaim_acquire(_RET_IP_);
4587 :
4588 : #ifdef CONFIG_MMU_NOTIFIER
4589 : lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
4590 : lock_map_release(&__mmu_notifier_invalidate_range_start_map);
4591 : #endif
4592 :
4593 : }
4594 : }
4595 : EXPORT_SYMBOL_GPL(fs_reclaim_acquire);
4596 :
4597 : void fs_reclaim_release(gfp_t gfp_mask)
4598 : {
4599 : gfp_mask = current_gfp_context(gfp_mask);
4600 :
4601 : if (__need_reclaim(gfp_mask)) {
4602 : if (gfp_mask & __GFP_FS)
4603 : __fs_reclaim_release(_RET_IP_);
4604 : }
4605 : }
4606 : EXPORT_SYMBOL_GPL(fs_reclaim_release);
4607 : #endif
4608 :
4609 : /* Perform direct synchronous page reclaim */
4610 : static unsigned long
4611 0 : __perform_reclaim(gfp_t gfp_mask, unsigned int order,
4612 : const struct alloc_context *ac)
4613 : {
4614 : unsigned int noreclaim_flag;
4615 : unsigned long progress;
4616 :
4617 0 : cond_resched();
4618 :
4619 : /* We now go into synchronous reclaim */
4620 : cpuset_memory_pressure_bump();
4621 0 : fs_reclaim_acquire(gfp_mask);
4622 0 : noreclaim_flag = memalloc_noreclaim_save();
4623 :
4624 0 : progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
4625 : ac->nodemask);
4626 :
4627 0 : memalloc_noreclaim_restore(noreclaim_flag);
4628 0 : fs_reclaim_release(gfp_mask);
4629 :
4630 0 : cond_resched();
4631 :
4632 0 : return progress;
4633 : }
4634 :
4635 : /* The really slow allocator path where we enter direct reclaim */
4636 : static inline struct page *
4637 0 : __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
4638 : unsigned int alloc_flags, const struct alloc_context *ac,
4639 : unsigned long *did_some_progress)
4640 : {
4641 0 : struct page *page = NULL;
4642 : unsigned long pflags;
4643 0 : bool drained = false;
4644 :
4645 0 : psi_memstall_enter(&pflags);
4646 0 : *did_some_progress = __perform_reclaim(gfp_mask, order, ac);
4647 0 : if (unlikely(!(*did_some_progress)))
4648 : goto out;
4649 :
4650 : retry:
4651 0 : page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
4652 :
4653 : /*
4654 : * If an allocation failed after direct reclaim, it could be because
4655 : * pages are pinned on the per-cpu lists or in high alloc reserves.
4656 : * Shrink them and try again
4657 : */
4658 0 : if (!page && !drained) {
4659 0 : unreserve_highatomic_pageblock(ac, false);
4660 0 : drain_all_pages(NULL);
4661 0 : drained = true;
4662 0 : goto retry;
4663 : }
4664 : out:
4665 0 : psi_memstall_leave(&pflags);
4666 :
4667 0 : return page;
4668 : }
4669 :
4670 0 : static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
4671 : const struct alloc_context *ac)
4672 : {
4673 : struct zoneref *z;
4674 : struct zone *zone;
4675 0 : pg_data_t *last_pgdat = NULL;
4676 0 : enum zone_type highest_zoneidx = ac->highest_zoneidx;
4677 :
4678 0 : for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
4679 : ac->nodemask) {
4680 0 : if (last_pgdat != zone->zone_pgdat)
4681 0 : wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
4682 0 : last_pgdat = zone->zone_pgdat;
4683 : }
4684 0 : }
4685 :
4686 : static inline unsigned int
4687 0 : gfp_to_alloc_flags(gfp_t gfp_mask)
4688 : {
4689 0 : unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
4690 :
4691 : /*
4692 : * __GFP_HIGH is assumed to be the same as ALLOC_HIGH
4693 : * and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
4694 : * to save two branches.
4695 : */
4696 : BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
4697 : BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD);
4698 :
4699 : /*
4700 : * The caller may dip into page reserves a bit more if the caller
4701 : * cannot run direct reclaim, or if the caller has realtime scheduling
4702 : * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
4703 : * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
4704 : */
4705 0 : alloc_flags |= (__force int)
4706 : (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM));
4707 :
4708 0 : if (gfp_mask & __GFP_ATOMIC) {
4709 : /*
4710 : * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
4711 : * if it can't schedule.
4712 : */
4713 0 : if (!(gfp_mask & __GFP_NOMEMALLOC))
4714 0 : alloc_flags |= ALLOC_HARDER;
4715 : /*
4716 : * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
4717 : * comment for __cpuset_node_allowed().
4718 : */
4719 0 : alloc_flags &= ~ALLOC_CPUSET;
4720 0 : } else if (unlikely(rt_task(current)) && in_task())
4721 0 : alloc_flags |= ALLOC_HARDER;
4722 :
4723 0 : alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags);
4724 :
4725 0 : return alloc_flags;
4726 : }
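
/*
 * Worked example (not from the source): a GFP_ATOMIC request carries
 * __GFP_HIGH | __GFP_ATOMIC | __GFP_KSWAPD_RECLAIM, so this returns
 * ALLOC_WMARK_MIN | ALLOC_HIGH | ALLOC_KSWAPD | ALLOC_HARDER with
 * ALLOC_CPUSET cleared, letting the request dip further below the min
 * watermark and ignore cpuset restrictions rather than fail.
 */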
4727 :
4728 : static bool oom_reserves_allowed(struct task_struct *tsk)
4729 : {
4730 0 : if (!tsk_is_oom_victim(tsk))
4731 : return false;
4732 :
4733 : /*
4734 : * !MMU doesn't have oom reaper so give access to memory reserves
4735 : * only to the thread with TIF_MEMDIE set
4736 : */
4737 : if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE))
4738 : return false;
4739 :
4740 : return true;
4741 : }
4742 :
4743 : /*
4744 : * Distinguish requests which really need access to full memory
4745 : * reserves from oom victims which can live with a portion of it
4746 : */
4747 0 : static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
4748 : {
4749 0 : if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
4750 : return 0;
4751 0 : if (gfp_mask & __GFP_MEMALLOC)
4752 : return ALLOC_NO_WATERMARKS;
4753 0 : if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
4754 : return ALLOC_NO_WATERMARKS;
4755 0 : if (!in_interrupt()) {
4756 0 : if (current->flags & PF_MEMALLOC)
4757 : return ALLOC_NO_WATERMARKS;
4758 0 : else if (oom_reserves_allowed(current))
4759 : return ALLOC_OOM;
4760 : }
4761 :
4762 : return 0;
4763 : }
4764 :
4765 0 : bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
4766 : {
4767 0 : return !!__gfp_pfmemalloc_flags(gfp_mask);
4768 : }
4769 :
4770 : /*
4771 : * Checks whether it makes sense to retry the reclaim to make a forward progress
4772 : * for the given allocation request.
4773 : *
4774 : * We give up when we either have tried MAX_RECLAIM_RETRIES in a row
4775 : * without success, or when we couldn't even meet the watermark if we
4776 : * reclaimed all remaining pages on the LRU lists.
4777 : *
4778 : * Returns true if a retry is viable or false to enter the oom path.
4779 : */
4780 : static inline bool
4781 0 : should_reclaim_retry(gfp_t gfp_mask, unsigned order,
4782 : struct alloc_context *ac, int alloc_flags,
4783 : bool did_some_progress, int *no_progress_loops)
4784 : {
4785 : struct zone *zone;
4786 : struct zoneref *z;
4787 0 : bool ret = false;
4788 :
4789 : /*
4790 : * Costly allocations might have made progress but this doesn't mean
4791 : * their order will become available due to high fragmentation so
4792 : * always increment the no progress counter for them
4793 : */
4794 0 : if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
4795 0 : *no_progress_loops = 0;
4796 : else
4797 0 : (*no_progress_loops)++;
4798 :
4799 : /*
4800 : * Make sure we converge to OOM if we cannot make any progress
4801 : * several times in the row.
4802 : */
4803 0 : if (*no_progress_loops > MAX_RECLAIM_RETRIES) {
4804 : /* Before OOM, exhaust highatomic_reserve */
4805 0 : return unreserve_highatomic_pageblock(ac, true);
4806 : }
4807 :
4808 : /*
4809 : * Keep reclaiming pages while there is a chance this will lead
4810 : * somewhere. If none of the target zones can satisfy our allocation
4811 : * request even if all reclaimable pages are considered then we are
4812 : * screwed and have to go OOM.
4813 : */
4814 0 : for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
4815 : ac->highest_zoneidx, ac->nodemask) {
4816 : unsigned long available;
4817 : unsigned long reclaimable;
4818 0 : unsigned long min_wmark = min_wmark_pages(zone);
4819 : bool wmark;
4820 :
4821 0 : available = reclaimable = zone_reclaimable_pages(zone);
4822 0 : available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
4823 :
4824 : /*
4825 : * Would the allocation succeed if we reclaimed all
4826 : * reclaimable pages?
4827 : */
4828 0 : wmark = __zone_watermark_ok(zone, order, min_wmark,
4829 0 : ac->highest_zoneidx, alloc_flags, available);
4830 0 : trace_reclaim_retry_zone(z, order, reclaimable,
4831 : available, min_wmark, *no_progress_loops, wmark);
4832 0 : if (wmark) {
4833 : ret = true;
4834 : break;
4835 : }
4836 : }
4837 :
4838 : /*
4839 : * Memory allocation/reclaim might be called from a WQ context and the
4840 : * current implementation of the WQ concurrency control doesn't
4841 : * recognize that a particular WQ is congested if the worker thread is
4842 : * looping without ever sleeping. Therefore we have to do a short sleep
4843 : * here rather than calling cond_resched().
4844 : */
4845 0 : if (current->flags & PF_WQ_WORKER)
4846 0 : schedule_timeout_uninterruptible(1);
4847 : else
4848 0 : cond_resched();
4849 : return ret;
4850 : }
4851 :
4852 : static inline bool
4853 : check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac)
4854 : {
4855 : /*
4856 : * It's possible that cpuset's mems_allowed and the nodemask from
4857 : * mempolicy don't intersect. This should be normally dealt with by
4858 : * policy_nodemask(), but it's possible to race with cpuset update in
4859 : * such a way the check therein was true, and then it became false
4860 : * before we got our cpuset_mems_cookie here.
4861 : * This assumes that for all allocations, ac->nodemask can come only
4862 : * from MPOL_BIND mempolicy (whose documented semantics is to be ignored
4863 : * when it does not intersect with the cpuset restrictions) or the
4864 : * caller can deal with a violated nodemask.
4865 : */
4866 : if (cpusets_enabled() && ac->nodemask &&
4867 : !cpuset_nodemask_valid_mems_allowed(ac->nodemask)) {
4868 : ac->nodemask = NULL;
4869 : return true;
4870 : }
4871 :
4872 : /*
4873 : * When updating a task's mems_allowed or mempolicy nodemask, it is
4874 : * possible to race with parallel threads in such a way that our
4875 : * allocation can fail while the mask is being updated. If we are about
4876 : * to fail, check if the cpuset changed during allocation and if so,
4877 : * retry.
4878 : */
4879 0 : if (read_mems_allowed_retry(cpuset_mems_cookie))
4880 : return true;
4881 :
4882 : return false;
4883 : }
4884 :
4885 : static inline struct page *
4886 0 : __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
4887 : struct alloc_context *ac)
4888 : {
4889 0 : bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
4890 0 : const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
4891 0 : struct page *page = NULL;
4892 : unsigned int alloc_flags;
4893 : unsigned long did_some_progress;
4894 : enum compact_priority compact_priority;
4895 : enum compact_result compact_result;
4896 : int compaction_retries;
4897 : int no_progress_loops;
4898 : unsigned int cpuset_mems_cookie;
4899 : int reserve_flags;
4900 :
4901 : /*
4902 : * We also sanity check to catch abuse of atomic reserves being used by
4903 : * callers that are not in atomic context.
4904 : */
4905 0 : if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
4906 : (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
4907 0 : gfp_mask &= ~__GFP_ATOMIC;
4908 :
4909 : retry_cpuset:
4910 0 : compaction_retries = 0;
4911 0 : no_progress_loops = 0;
4912 0 : compact_priority = DEF_COMPACT_PRIORITY;
4913 0 : cpuset_mems_cookie = read_mems_allowed_begin();
4914 :
4915 : /*
4916 : * The fast path uses conservative alloc_flags to succeed only until
4917 : * kswapd needs to be woken up, and to avoid the cost of setting up
4918 : * alloc_flags precisely. So we do that now.
4919 : */
4920 0 : alloc_flags = gfp_to_alloc_flags(gfp_mask);
4921 :
4922 : /*
4923 : * We need to recalculate the starting point for the zonelist iterator
4924 : * because we might have used different nodemask in the fast path, or
4925 : * there was a cpuset modification and we are retrying - otherwise we
4926 : * could end up iterating over non-eligible zones endlessly.
4927 : */
4928 0 : ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4929 : ac->highest_zoneidx, ac->nodemask);
4930 0 : if (!ac->preferred_zoneref->zone)
4931 : goto nopage;
4932 :
4933 : /*
4934 : * Check for insane configurations where the cpuset doesn't contain
4935 : * any suitable zone to satisfy the request - e.g. non-movable
4936 : * GFP_HIGHUSER allocations from MOVABLE nodes only.
4937 : */
4938 : if (cpusets_insane_config() && (gfp_mask & __GFP_HARDWALL)) {
4939 : struct zoneref *z = first_zones_zonelist(ac->zonelist,
4940 : ac->highest_zoneidx,
4941 : &cpuset_current_mems_allowed);
4942 : if (!z->zone)
4943 : goto nopage;
4944 : }
4945 :
4946 0 : if (alloc_flags & ALLOC_KSWAPD)
4947 0 : wake_all_kswapds(order, gfp_mask, ac);
4948 :
4949 : /*
4950 : * The adjusted alloc_flags might result in immediate success, so try
4951 : * that first
4952 : */
4953 0 : page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
4954 0 : if (page)
4955 : goto got_pg;
4956 :
4957 : /*
4958 : * For costly allocations, try direct compaction first, as it's likely
4959 : * that we have enough base pages and don't need to reclaim. For non-
4960 : * movable high-order allocations, do that as well, as compaction will
4961 : * try prevent permanent fragmentation by migrating from blocks of the
4962 : * same migratetype.
4963 : * Don't try this for allocations that are allowed to ignore
4964 : * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
4965 : */
4966 0 : if (can_direct_reclaim &&
4967 0 : (costly_order ||
4968 0 : (order > 0 && ac->migratetype != MIGRATE_MOVABLE))
4969 0 : && !gfp_pfmemalloc_allowed(gfp_mask)) {
4970 0 : page = __alloc_pages_direct_compact(gfp_mask, order,
4971 : alloc_flags, ac,
4972 : INIT_COMPACT_PRIORITY,
4973 : &compact_result);
4974 0 : if (page)
4975 : goto got_pg;
4976 :
4977 : /*
4978 : * Checks for costly allocations with __GFP_NORETRY, which
4979 : * includes some THP page fault allocations
4980 : */
4981 0 : if (costly_order && (gfp_mask & __GFP_NORETRY)) {
4982 : /*
4983 : * If allocating entire pageblock(s) and compaction
4984 : * failed because all zones are below low watermarks
4985 : * or is prohibited because it recently failed at this
4986 : * order, fail immediately unless the allocator has
4987 : * requested compaction and reclaim retry.
4988 : *
4989 : * Reclaim is
4990 : * - potentially very expensive because zones are far
4991 : * below their low watermarks or this is part of very
4992 : * bursty high order allocations,
4993 : * - not guaranteed to help because isolate_freepages()
4994 : * may not iterate over freed pages as part of its
4995 : * linear scan, and
4996 : * - unlikely to make entire pageblocks free on its
4997 : * own.
4998 : */
4999 0 : if (compact_result == COMPACT_SKIPPED ||
5000 : compact_result == COMPACT_DEFERRED)
5001 : goto nopage;
5002 :
5003 : /*
5004 : * Looks like reclaim/compaction is worth trying, but
5005 : * sync compaction could be very expensive, so keep
5006 : * using async compaction.
5007 : */
5008 0 : compact_priority = INIT_COMPACT_PRIORITY;
5009 : }
5010 : }
5011 :
5012 : retry:
5013 : /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
5014 0 : if (alloc_flags & ALLOC_KSWAPD)
5015 0 : wake_all_kswapds(order, gfp_mask, ac);
5016 :
5017 0 : reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
5018 0 : if (reserve_flags)
5019 0 : alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags);
5020 :
5021 : /*
5022 : * Reset the nodemask and zonelist iterators if memory policies can be
5023 : * ignored. These allocations are high priority and system rather than
5024 : * user oriented.
5025 : */
5026 0 : if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
5027 0 : ac->nodemask = NULL;
5028 0 : ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
5029 : ac->highest_zoneidx, ac->nodemask);
5030 : }
5031 :
5032 : /* Attempt with potentially adjusted zonelist and alloc_flags */
5033 0 : page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
5034 0 : if (page)
5035 : goto got_pg;
5036 :
5037 : /* Caller is not willing to reclaim, we can't balance anything */
5038 0 : if (!can_direct_reclaim)
5039 : goto nopage;
5040 :
5041 : /* Avoid recursion of direct reclaim */
5042 0 : if (current->flags & PF_MEMALLOC)
5043 : goto nopage;
5044 :
5045 : /* Try direct reclaim and then allocating */
5046 0 : page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
5047 : &did_some_progress);
5048 0 : if (page)
5049 : goto got_pg;
5050 :
5051 : /* Try direct compaction and then allocating */
5052 0 : page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
5053 : compact_priority, &compact_result);
5054 0 : if (page)
5055 : goto got_pg;
5056 :
5057 : /* Do not loop if specifically requested */
5058 0 : if (gfp_mask & __GFP_NORETRY)
5059 : goto nopage;
5060 :
5061 : /*
5062 : * Do not retry costly high order allocations unless they are
5063 : * __GFP_RETRY_MAYFAIL
5064 : */
5065 0 : if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
5066 : goto nopage;
5067 :
5068 0 : if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
5069 : did_some_progress > 0, &no_progress_loops))
5070 : goto retry;
5071 :
5072 : /*
5073 : * It doesn't make any sense to retry compaction if the order-0
5074 : * reclaim is not able to make any progress because the current
5075 : * implementation of compaction depends on a sufficient amount
5076 : * of free memory (see __compaction_suitable)
5077 : */
5078 0 : if (did_some_progress > 0 &&
5079 0 : should_compact_retry(ac, order, alloc_flags,
5080 : compact_result, &compact_priority,
5081 : &compaction_retries))
5082 : goto retry;
5083 :
5084 :
5085 : /* Deal with possible cpuset update races before we start OOM killing */
5086 0 : if (check_retry_cpuset(cpuset_mems_cookie, ac))
5087 : goto retry_cpuset;
5088 :
5089 : /* Reclaim has failed us, start killing things */
5090 0 : page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
5091 0 : if (page)
5092 : goto got_pg;
5093 :
5094 : /* Avoid allocations with no watermarks from looping endlessly */
5095 0 : if (tsk_is_oom_victim(current) &&
5096 0 : (alloc_flags & ALLOC_OOM ||
5097 0 : (gfp_mask & __GFP_NOMEMALLOC)))
5098 : goto nopage;
5099 :
5100 : /* Retry as long as the OOM killer is making progress */
5101 0 : if (did_some_progress) {
5102 0 : no_progress_loops = 0;
5103 0 : goto retry;
5104 : }
5105 :
5106 : nopage:
5107 : /* Deal with possible cpuset update races before we fail */
5108 0 : if (check_retry_cpuset(cpuset_mems_cookie, ac))
5109 : goto retry_cpuset;
5110 :
5111 : /*
5112 : * Make sure that a __GFP_NOFAIL request doesn't leak out and make sure
5113 : * we always retry
5114 : */
5115 0 : if (gfp_mask & __GFP_NOFAIL) {
5116 : /*
5117 : * All existing users of __GFP_NOFAIL are blockable, so warn
5118 : * of any new users that actually require GFP_NOWAIT
5119 : */
5120 0 : if (WARN_ON_ONCE(!can_direct_reclaim))
5121 : goto fail;
5122 :
5123 : /*
5124 : * A PF_MEMALLOC request from this context is rather bizarre
5125 : * because we cannot reclaim anything and can only loop waiting
5126 : * for somebody to do the work for us
5127 : */
5128 0 : WARN_ON_ONCE(current->flags & PF_MEMALLOC);
5129 :
5130 : /*
5131 : * Non-failing costly orders are a hard requirement which we
5132 : * are not well prepared for, so let's warn about these users
5133 : * so that we can identify them and convert them to something
5134 : * else.
5135 : */
5136 0 : WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);
5137 :
5138 : /*
5139 : * Help non-failing allocations by giving them access to memory
5140 : * reserves but do not use ALLOC_NO_WATERMARKS because this
5141 : * could deplete whole memory reserves which would just make
5142 : * the situation worse
5143 : */
5144 0 : page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);
5145 0 : if (page)
5146 : goto got_pg;
5147 :
5148 0 : cond_resched();
5149 0 : goto retry;
5150 : }
5151 : fail:
5152 0 : warn_alloc(gfp_mask, ac->nodemask,
5153 : "page allocation failure: order:%u", order);
5154 : got_pg:
5155 0 : return page;
5156 : }
5157 :
5158 483 : static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
5159 : int preferred_nid, nodemask_t *nodemask,
5160 : struct alloc_context *ac, gfp_t *alloc_gfp,
5161 : unsigned int *alloc_flags)
5162 : {
5163 483 : ac->highest_zoneidx = gfp_zone(gfp_mask);
5164 966 : ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
5165 483 : ac->nodemask = nodemask;
5166 483 : ac->migratetype = gfp_migratetype(gfp_mask);
5167 :
5168 : if (cpusets_enabled()) {
5169 : *alloc_gfp |= __GFP_HARDWALL;
5170 : /*
5171 : * When we are in interrupt context, the cpuset of the
5172 : * current task is irrelevant, so any node is acceptable.
5173 : */
5174 : if (in_task() && !ac->nodemask)
5175 : ac->nodemask = &cpuset_current_mems_allowed;
5176 : else
5177 : *alloc_flags |= ALLOC_CPUSET;
5178 : }
5179 :
5180 483 : fs_reclaim_acquire(gfp_mask);
5181 483 : fs_reclaim_release(gfp_mask);
5182 :
5183 : might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
5184 :
5185 483 : if (should_fail_alloc_page(gfp_mask, order))
5186 : return false;
5187 :
5188 483 : *alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags);
5189 :
5190 : /* Dirty zone balancing only done in the fast path */
5191 483 : ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
5192 :
5193 : /*
5194 : * The preferred zone is used for statistics but crucially it is
5195 : * also used as the starting point for the zonelist iterator. It
5196 : * may get reset for allocations that ignore memory policies.
5197 : */
5198 966 : ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
5199 : ac->highest_zoneidx, ac->nodemask);
5200 :
5201 : return true;
5202 : }
5203 :
5204 : /*
5205 : * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array
5206 : * @gfp: GFP flags for the allocation
5207 : * @preferred_nid: The preferred NUMA node ID to allocate from
5208 : * @nodemask: Set of nodes to allocate from, may be NULL
5209 : * @nr_pages: The number of pages desired on the list or array
5210 : * @page_list: Optional list to store the allocated pages
5211 : * @page_array: Optional array to store the pages
5212 : *
5213 : * This is a batched version of the page allocator that attempts to
5214 : * allocate nr_pages quickly. Pages are added to page_list if page_list
5215 : * is not NULL, otherwise it is assumed that the page_array is valid.
5216 : *
5217 : * For lists, nr_pages is the number of pages that should be allocated.
5218 : *
5219 : * For arrays, only NULL elements are populated with pages and nr_pages
5220 : * is the maximum number of pages that will be stored in the array.
5221 : *
5222 : * Returns the number of pages on the list or array.
5223 : */
5224 15 : unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
5225 : nodemask_t *nodemask, int nr_pages,
5226 : struct list_head *page_list,
5227 : struct page **page_array)
5228 : {
5229 : struct page *page;
5230 : unsigned long flags;
5231 : struct zone *zone;
5232 : struct zoneref *z;
5233 : struct per_cpu_pages *pcp;
5234 : struct list_head *pcp_list;
5235 : struct alloc_context ac;
5236 : gfp_t alloc_gfp;
5237 15 : unsigned int alloc_flags = ALLOC_WMARK_LOW;
5238 15 : int nr_populated = 0, nr_account = 0;
5239 :
5240 : /*
5241 : * Skip populated array elements to determine if any pages need
5242 : * to be allocated before disabling IRQs.
5243 : */
5244 30 : while (page_array && nr_populated < nr_pages && page_array[nr_populated])
5245 0 : nr_populated++;
5246 :
5247 : /* No pages requested? */
5248 15 : if (unlikely(nr_pages <= 0))
5249 : goto out;
5250 :
5251 : /* Already populated array? */
5252 15 : if (unlikely(page_array && nr_pages - nr_populated == 0))
5253 : goto out;
5254 :
5255 : /* Bulk allocator does not support memcg accounting. */
5256 : if (memcg_kmem_enabled() && (gfp & __GFP_ACCOUNT))
5257 : goto failed;
5258 :
5259 : /* Use the single page allocator for one page. */
5260 15 : if (nr_pages - nr_populated == 1)
5261 : goto failed;
5262 :
5263 : #ifdef CONFIG_PAGE_OWNER
5264 : /*
5265 : * PAGE_OWNER may recurse into the allocator to allocate space to
5266 : * save the stack with pagesets.lock held. Releasing/reacquiring
5267 : * removes much of the performance benefit of bulk allocation so
5268 : * force the caller to allocate one page at a time, as that has
5269 : * similar performance to adding the complexity to the bulk allocator.
5270 : */
5271 : if (static_branch_unlikely(&page_owner_inited))
5272 : goto failed;
5273 : #endif
5274 :
5275 : /* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */
5276 15 : gfp &= gfp_allowed_mask;
5277 15 : alloc_gfp = gfp;
5278 15 : if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags))
5279 : goto out;
5280 15 : gfp = alloc_gfp;
5281 :
5282 : /* Find an allowed local zone that meets the low watermark. */
5283 30 : for_each_zone_zonelist_nodemask(zone, z, ac.zonelist, ac.highest_zoneidx, ac.nodemask) {
5284 : unsigned long mark;
5285 :
5286 : if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) &&
5287 : !__cpuset_zone_allowed(zone, gfp)) {
5288 : continue;
5289 : }
5290 :
5291 : if (nr_online_nodes > 1 && zone != ac.preferred_zoneref->zone &&
5292 : zone_to_nid(zone) != zone_to_nid(ac.preferred_zoneref->zone)) {
5293 : goto failed;
5294 : }
5295 :
5296 15 : mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages;
5297 15 : if (zone_watermark_fast(zone, 0, mark,
5298 : zonelist_zone_idx(ac.preferred_zoneref),
5299 : alloc_flags, gfp)) {
5300 : break;
5301 : }
5302 : }
5303 :
5304 : /*
5305 : * If there are no allowed local zones that meet the watermarks then
5306 : * try to allocate a single page and reclaim if necessary.
5307 : */
5308 15 : if (unlikely(!zone))
5309 : goto failed;
5310 :
5311 : /* Attempt the batch allocation */
5312 15 : local_lock_irqsave(&pagesets.lock, flags);
5313 15 : pcp = this_cpu_ptr(zone->per_cpu_pageset);
5314 30 : pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
5315 :
5316 90 : while (nr_populated < nr_pages) {
5317 :
5318 : /* Skip existing pages */
5319 60 : if (page_array && page_array[nr_populated]) {
5320 0 : nr_populated++;
5321 0 : continue;
5322 : }
5323 :
5324 60 : page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags,
5325 : pcp, pcp_list);
5326 60 : if (unlikely(!page)) {
5327 : /* Try and get at least one page */
5328 0 : if (!nr_populated)
5329 : goto failed_irq;
5330 : break;
5331 : }
5332 60 : nr_account++;
5333 :
5334 60 : prep_new_page(page, 0, gfp, 0);
5335 60 : if (page_list)
5336 0 : list_add(&page->lru, page_list);
5337 : else
5338 60 : page_array[nr_populated] = page;
5339 60 : nr_populated++;
5340 : }
5341 :
5342 30 : local_unlock_irqrestore(&pagesets.lock, flags);
5343 :
5344 30 : __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
5345 15 : zone_statistics(ac.preferred_zoneref->zone, zone, nr_account);
5346 :
5347 : out:
5348 15 : return nr_populated;
5349 :
5350 : failed_irq:
5351 0 : local_unlock_irqrestore(&pagesets.lock, flags);
5352 :
5353 : failed:
5354 0 : page = __alloc_pages(gfp, 0, preferred_nid, nodemask);
5355 0 : if (page) {
5356 0 : if (page_list)
5357 0 : list_add(&page->lru, page_list);
5358 : else
5359 0 : page_array[nr_populated] = page;
5360 0 : nr_populated++;
5361 : }
5362 :
5363 : goto out;
5364 : }
5365 : EXPORT_SYMBOL_GPL(__alloc_pages_bulk);
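/*
 * A minimal usage sketch (illustrative only, not part of page_alloc.c):
 * filling a caller-provided, initially all-NULL array of order-0 pages
 * with the bulk allocator and topping up any shortfall with the
 * single-page allocator. The helper name fill_page_array() is
 * hypothetical.
 */
static int fill_page_array(struct page **pages, int nr)
{
	/* Returns how many leading slots of pages[] are now populated. */
	int populated = __alloc_pages_bulk(GFP_KERNEL, numa_mem_id(), NULL,
					   nr, NULL, pages);

	/* The bulk allocator may legitimately return fewer than nr pages. */
	while (populated < nr) {
		pages[populated] = alloc_page(GFP_KERNEL);
		if (!pages[populated])
			break;
		populated++;
	}
	return populated;
}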
5366 :
5367 : /*
5368 : * This is the 'heart' of the zoned buddy allocator.
5369 : */
5370 468 : struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
5371 : nodemask_t *nodemask)
5372 : {
5373 : struct page *page;
5374 468 : unsigned int alloc_flags = ALLOC_WMARK_LOW;
5375 : gfp_t alloc_gfp; /* The gfp_t that was actually used for allocation */
5376 468 : struct alloc_context ac = { };
5377 :
5378 : /*
5379 : * There are several places where we assume that the order value is sane
5380 : * so bail out early if the request is out of bound.
5381 : */
5382 468 : if (unlikely(order >= MAX_ORDER)) {
5383 0 : WARN_ON_ONCE(!(gfp & __GFP_NOWARN));
5384 : return NULL;
5385 : }
5386 :
5387 468 : gfp &= gfp_allowed_mask;
5388 : /*
5389 : * Apply scoped allocation constraints. This is mainly about GFP_NOFS
5390 : * resp. GFP_NOIO which has to be inherited for all allocation requests
5391 : * from a particular context which has been marked by
5392 : * memalloc_no{fs,io}_{save,restore}. And PF_MEMALLOC_PIN which ensures
5393 : * movable zones are not used during allocation.
5394 : */
5395 468 : gfp = current_gfp_context(gfp);
5396 468 : alloc_gfp = gfp;
5397 468 : if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask, &ac,
5398 : &alloc_gfp, &alloc_flags))
5399 : return NULL;
5400 :
5401 : /*
5402 : * Forbid the first pass from falling back to types that fragment
5403 : * memory until all local zones are considered.
5404 : */
5405 936 : alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp);
5406 :
5407 : /* First allocation attempt */
5408 468 : page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
5409 468 : if (likely(page))
5410 : goto out;
5411 :
5412 0 : alloc_gfp = gfp;
5413 0 : ac.spread_dirty_pages = false;
5414 :
5415 : /*
5416 : * Restore the original nodemask if it was potentially replaced with
5417 : * &cpuset_current_mems_allowed to optimize the fast-path attempt.
5418 : */
5419 0 : ac.nodemask = nodemask;
5420 :
5421 0 : page = __alloc_pages_slowpath(alloc_gfp, order, &ac);
5422 :
5423 : out:
5424 : if (memcg_kmem_enabled() && (gfp & __GFP_ACCOUNT) && page &&
5425 : unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) {
5426 : __free_pages(page, order);
5427 : page = NULL;
5428 : }
5429 :
5430 468 : trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
5431 :
5432 468 : return page;
5433 : }
5434 : EXPORT_SYMBOL(__alloc_pages);
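/*
 * Illustrative sketch (not part of this file): a direct order-2 request
 * through the core entry point. On !NUMA kernels this is roughly what the
 * alloc_pages() wrapper expands to; the function name is hypothetical.
 */
static struct page *demo_grab_four_pages(void)
{
	/* Prefer the local node; a NULL nodemask means "no explicit policy". */
	return __alloc_pages(GFP_KERNEL, 2, numa_node_id(), NULL);
}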
5435 :
5436 0 : struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid,
5437 : nodemask_t *nodemask)
5438 : {
5439 0 : struct page *page = __alloc_pages(gfp | __GFP_COMP, order,
5440 : preferred_nid, nodemask);
5441 :
5442 : if (page && order > 1)
5443 : prep_transhuge_page(page);
5444 0 : return (struct folio *)page;
5445 : }
5446 : EXPORT_SYMBOL(__folio_alloc);
5447 :
5448 : /*
5449 : * Common helper functions. Never use with __GFP_HIGHMEM because the returned
5450 : * address cannot represent highmem pages. Use alloc_pages and then kmap if
5451 : * you need to access high mem.
5452 : */
5453 4 : unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
5454 : {
5455 : struct page *page;
5456 :
5457 8 : page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order);
5458 4 : if (!page)
5459 : return 0;
5460 4 : return (unsigned long) page_address(page);
5461 : }
5462 : EXPORT_SYMBOL(__get_free_pages);
5463 :
5464 0 : unsigned long get_zeroed_page(gfp_t gfp_mask)
5465 : {
5466 0 : return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
5467 : }
5468 : EXPORT_SYMBOL(get_zeroed_page);
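/*
 * Illustrative sketch: the common pattern of taking a zeroed page by
 * virtual address and releasing it again with free_page(). The function
 * name is hypothetical.
 */
static int demo_use_scratch_page(void)
{
	unsigned long addr = get_zeroed_page(GFP_KERNEL);

	if (!addr)
		return -ENOMEM;

	/* ... use the zero-filled PAGE_SIZE buffer at (void *)addr ... */

	free_page(addr);
	return 0;
}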
5469 :
5470 : /**
5471 : * __free_pages - Free pages allocated with alloc_pages().
5472 : * @page: The page pointer returned from alloc_pages().
5473 : * @order: The order of the allocation.
5474 : *
5475 : * This function can free multi-page allocations that are not compound
5476 : * pages. It does not check that the @order passed in matches that of
5477 : * the allocation, so it is easy to leak memory. Freeing more memory
5478 : * than was allocated will probably emit a warning.
5479 : *
5480 : * If the last reference to this page is speculative, it will be released
5481 : * by put_page() which only frees the first page of a non-compound
5482 : * allocation. To prevent the remaining pages from being leaked, we free
5483 : * the subsequent pages here. If you want to use the page's reference
5484 : * count to decide when to free the allocation, you should allocate a
5485 : * compound page, and use put_page() instead of __free_pages().
5486 : *
5487 : * Context: May be called in interrupt context or while holding a normal
5488 : * spinlock, but not in NMI context or while holding a raw spinlock.
5489 : */
5490 11 : void __free_pages(struct page *page, unsigned int order)
5491 : {
5492 11 : if (put_page_testzero(page))
5493 11 : free_the_page(page, order);
5494 0 : else if (!PageHead(page))
5495 0 : while (order-- > 0)
5496 0 : free_the_page(page + (1 << order), order);
5497 11 : }
5498 : EXPORT_SYMBOL(__free_pages);
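/*
 * Illustrative sketch: pairing a multi-page, non-compound allocation with
 * __free_pages() using the same order. As the comment above notes, if the
 * page refcount is meant to control the lifetime, a __GFP_COMP allocation
 * freed with put_page() is the safer pattern. The function name is
 * hypothetical.
 */
static void demo_alloc_then_free(void)
{
	struct page *page = alloc_pages(GFP_KERNEL, 3);	/* 8 pages */

	if (!page)
		return;
	/* ... use the pages ... */
	__free_pages(page, 3);		/* order must match the allocation */
}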
5499 :
5500 0 : void free_pages(unsigned long addr, unsigned int order)
5501 : {
5502 0 : if (addr != 0) {
5503 : VM_BUG_ON(!virt_addr_valid((void *)addr));
5504 0 : __free_pages(virt_to_page((void *)addr), order);
5505 : }
5506 0 : }
5507 :
5508 : EXPORT_SYMBOL(free_pages);
5509 :
5510 : /*
5511 : * Page Fragment:
5512 : * An arbitrary-length arbitrary-offset area of memory which resides
5513 : * within a 0 or higher order page. Multiple fragments within that page
5514 : * are individually refcounted, in the page's reference counter.
5515 : *
5516 : * The page_frag functions below provide a simple allocation framework for
5517 : * page fragments. This is used by the network stack and network device
5518 : * drivers to provide a backing region of memory for use as either an
5519 : * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
5520 : */
5521 0 : static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
5522 : gfp_t gfp_mask)
5523 : {
5524 0 : struct page *page = NULL;
5525 0 : gfp_t gfp = gfp_mask;
5526 :
5527 : #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
5528 0 : gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
5529 : __GFP_NOMEMALLOC;
5530 0 : page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
5531 0 : PAGE_FRAG_CACHE_MAX_ORDER);
5532 0 : nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
5533 : #endif
5534 0 : if (unlikely(!page))
5535 0 : page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
5536 :
5537 0 : nc->va = page ? page_address(page) : NULL;
5538 :
5539 0 : return page;
5540 : }
5541 :
5542 0 : void __page_frag_cache_drain(struct page *page, unsigned int count)
5543 : {
5544 : VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
5545 :
5546 0 : if (page_ref_sub_and_test(page, count))
5547 0 : free_the_page(page, compound_order(page));
5548 0 : }
5549 : EXPORT_SYMBOL(__page_frag_cache_drain);
5550 :
5551 0 : void *page_frag_alloc_align(struct page_frag_cache *nc,
5552 : unsigned int fragsz, gfp_t gfp_mask,
5553 : unsigned int align_mask)
5554 : {
5555 0 : unsigned int size = PAGE_SIZE;
5556 : struct page *page;
5557 : int offset;
5558 :
5559 0 : if (unlikely(!nc->va)) {
5560 : refill:
5561 0 : page = __page_frag_cache_refill(nc, gfp_mask);
5562 0 : if (!page)
5563 : return NULL;
5564 :
5565 : #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
5566 : /* if size can vary use size else just use PAGE_SIZE */
5567 0 : size = nc->size;
5568 : #endif
5569 : /* Even if we own the page, we do not use atomic_set().
5570 : * This would break get_page_unless_zero() users.
5571 : */
5572 0 : page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
5573 :
5574 : /* reset page count bias and offset to start of new frag */
5575 0 : nc->pfmemalloc = page_is_pfmemalloc(page);
5576 0 : nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
5577 0 : nc->offset = size;
5578 : }
5579 :
5580 0 : offset = nc->offset - fragsz;
5581 0 : if (unlikely(offset < 0)) {
5582 0 : page = virt_to_page(nc->va);
5583 :
5584 0 : if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
5585 : goto refill;
5586 :
5587 0 : if (unlikely(nc->pfmemalloc)) {
5588 0 : free_the_page(page, compound_order(page));
5589 0 : goto refill;
5590 : }
5591 :
5592 : #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
5593 : /* if size can vary use size else just use PAGE_SIZE */
5594 0 : size = nc->size;
5595 : #endif
5596 : /* OK, page count is 0, we can safely set it */
5597 0 : set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
5598 :
5599 : /* reset page count bias and offset to start of new frag */
5600 0 : nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
5601 0 : offset = size - fragsz;
5602 : }
5603 :
5604 0 : nc->pagecnt_bias--;
5605 0 : offset &= align_mask;
5606 0 : nc->offset = offset;
5607 :
5608 0 : return nc->va + offset;
5609 : }
5610 : EXPORT_SYMBOL(page_frag_alloc_align);
5611 :
5612 : /*
5613 : * Frees a page fragment allocated out of either a compound or order 0 page.
5614 : */
5615 0 : void page_frag_free(void *addr)
5616 : {
5617 0 : struct page *page = virt_to_head_page(addr);
5618 :
5619 0 : if (unlikely(put_page_testzero(page)))
5620 0 : free_the_page(page, compound_order(page));
5621 0 : }
5622 : EXPORT_SYMBOL(page_frag_free);
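/*
 * Illustrative sketch: a fragment cache used the way the network stack
 * uses it. The cache, fragment size and 64-byte alignment below are
 * hypothetical; ~0x3fU masks the offset down to a 64-byte boundary.
 */
static struct page_frag_cache demo_frag_cache;

static void *demo_get_frag(void)
{
	/* Carve a 256-byte fragment out of the cached (possibly compound) page. */
	return page_frag_alloc_align(&demo_frag_cache, 256, GFP_ATOMIC, ~0x3fU);
}

static void demo_put_frag(void *frag)
{
	/* Drops one reference; the backing page is freed on the last put. */
	page_frag_free(frag);
}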
5623 :
5624 3 : static void *make_alloc_exact(unsigned long addr, unsigned int order,
5625 : size_t size)
5626 : {
5627 3 : if (addr) {
5628 3 : unsigned long alloc_end = addr + (PAGE_SIZE << order);
5629 3 : unsigned long used = addr + PAGE_ALIGN(size);
5630 :
5631 6 : split_page(virt_to_page((void *)addr), order);
5632 3 : while (used < alloc_end) {
5633 0 : free_page(used);
5634 0 : used += PAGE_SIZE;
5635 : }
5636 : }
5637 3 : return (void *)addr;
5638 : }
5639 :
5640 : /**
5641 : * alloc_pages_exact - allocate an exact number physically-contiguous pages.
5642 : * @size: the number of bytes to allocate
5643 : * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
5644 : *
5645 : * This function is similar to alloc_pages(), except that it allocates the
5646 : * minimum number of pages to satisfy the request. alloc_pages() can only
5647 : * allocate memory in power-of-two pages.
5648 : *
5649 : * This function is also limited by MAX_ORDER.
5650 : *
5651 : * Memory allocated by this function must be released by free_pages_exact().
5652 : *
5653 : * Return: pointer to the allocated area or %NULL in case of error.
5654 : */
5655 3 : void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
5656 : {
5657 3 : unsigned int order = get_order(size);
5658 : unsigned long addr;
5659 :
5660 3 : if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM)))
5661 0 : gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM);
5662 :
5663 3 : addr = __get_free_pages(gfp_mask, order);
5664 3 : return make_alloc_exact(addr, order, size);
5665 : }
5666 : EXPORT_SYMBOL(alloc_pages_exact);
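/*
 * Illustrative sketch: allocating a physically contiguous buffer whose
 * size is not a power-of-two number of pages. Assuming 4 KiB pages,
 * 20 KiB rounds up to an order-3 (32 KiB) block internally, and
 * make_alloc_exact() hands the trailing 12 KiB back to the buddy
 * allocator. Function names are hypothetical.
 */
static void *demo_alloc_20k(void)
{
	return alloc_pages_exact(20 * 1024, GFP_KERNEL);
}

static void demo_free_20k(void *buf)
{
	/* Must pass the same size that was requested at allocation time. */
	free_pages_exact(buf, 20 * 1024);
}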
5667 :
5668 : /**
5669 : * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
5670 : * pages on a node.
5671 : * @nid: the preferred node ID where memory should be allocated
5672 : * @size: the number of bytes to allocate
5673 : * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
5674 : *
5675 : * Like alloc_pages_exact(), but try to allocate on node nid first before falling
5676 : * back.
5677 : *
5678 : * Return: pointer to the allocated area or %NULL in case of error.
5679 : */
5680 0 : void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
5681 : {
5682 0 : unsigned int order = get_order(size);
5683 : struct page *p;
5684 :
5685 0 : if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM)))
5686 0 : gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM);
5687 :
5688 0 : p = alloc_pages_node(nid, gfp_mask, order);
5689 0 : if (!p)
5690 : return NULL;
5691 0 : return make_alloc_exact((unsigned long)page_address(p), order, size);
5692 : }
5693 :
5694 : /**
5695 : * free_pages_exact - release memory allocated via alloc_pages_exact()
5696 : * @virt: the value returned by alloc_pages_exact.
5697 : * @size: size of allocation, same value as passed to alloc_pages_exact().
5698 : *
5699 : * Release the memory allocated by a previous call to alloc_pages_exact.
5700 : */
5701 0 : void free_pages_exact(void *virt, size_t size)
5702 : {
5703 0 : unsigned long addr = (unsigned long)virt;
5704 0 : unsigned long end = addr + PAGE_ALIGN(size);
5705 :
5706 0 : while (addr < end) {
5707 0 : free_page(addr);
5708 0 : addr += PAGE_SIZE;
5709 : }
5710 0 : }
5711 : EXPORT_SYMBOL(free_pages_exact);
5712 :
5713 : /**
5714 : * nr_free_zone_pages - count number of pages beyond high watermark
5715 : * @offset: The zone index of the highest zone
5716 : *
5717 : * nr_free_zone_pages() counts the number of pages which are beyond the
5718 : * high watermark within all zones at or below a given zone index. For each
5719 : * zone, the number of pages is calculated as:
5720 : *
5721 : * nr_free_zone_pages = managed_pages - high_pages
5722 : *
5723 : * Return: number of pages beyond high watermark.
5724 : */
5725 3 : static unsigned long nr_free_zone_pages(int offset)
5726 : {
5727 : struct zoneref *z;
5728 : struct zone *zone;
5729 :
5730 : /* Just pick one node, since fallback list is circular */
5731 3 : unsigned long sum = 0;
5732 :
5733 6 : struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
5734 :
5735 12 : for_each_zone_zonelist(zone, z, zonelist, offset) {
5736 3 : unsigned long size = zone_managed_pages(zone);
5737 3 : unsigned long high = high_wmark_pages(zone);
5738 3 : if (size > high)
5739 3 : sum += size - high;
5740 : }
5741 :
5742 3 : return sum;
5743 : }
5744 :
5745 : /**
5746 : * nr_free_buffer_pages - count number of pages beyond high watermark
5747 : *
5748 : * nr_free_buffer_pages() counts the number of pages which are beyond the high
5749 : * watermark within ZONE_DMA and ZONE_NORMAL.
5750 : *
5751 : * Return: number of pages beyond high watermark within ZONE_DMA and
5752 : * ZONE_NORMAL.
5753 : */
5754 1 : unsigned long nr_free_buffer_pages(void)
5755 : {
5756 2 : return nr_free_zone_pages(gfp_zone(GFP_USER));
5757 : }
5758 : EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
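/*
 * A worked example of the calculation above, with hypothetical numbers:
 * on a system whose only relevant zone is a ZONE_NORMAL with 1,000,000
 * managed pages and a high watermark of 12,000 pages,
 * nr_free_buffer_pages() reports 1,000,000 - 12,000 = 988,000 pages.
 */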
5759 :
5760 : static inline void show_node(struct zone *zone)
5761 : {
5762 : if (IS_ENABLED(CONFIG_NUMA))
5763 : printk("Node %d ", zone_to_nid(zone));
5764 : }
5765 :
5766 0 : long si_mem_available(void)
5767 : {
5768 : long available;
5769 : unsigned long pagecache;
5770 0 : unsigned long wmark_low = 0;
5771 : unsigned long pages[NR_LRU_LISTS];
5772 : unsigned long reclaimable;
5773 : struct zone *zone;
5774 : int lru;
5775 :
5776 0 : for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
5777 0 : pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
5778 :
5779 0 : for_each_zone(zone)
5780 0 : wmark_low += low_wmark_pages(zone);
5781 :
5782 : /*
5783 : * Estimate the amount of memory available for userspace allocations,
5784 : * without causing swapping.
5785 : */
5786 0 : available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
5787 :
5788 : /*
5789 : * Not all the page cache can be freed, otherwise the system will
5790 : * start swapping. Assume at least half of the page cache, or the
5791 : * low watermark worth of cache, needs to stay.
5792 : */
5793 0 : pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
5794 0 : pagecache -= min(pagecache / 2, wmark_low);
5795 0 : available += pagecache;
5796 :
5797 : /*
5798 : * Part of the reclaimable slab and other kernel memory consists of
5799 : * items that are in use, and cannot be freed. Cap this estimate at the
5800 : * low watermark.
5801 : */
5802 0 : reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) +
5803 0 : global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
5804 0 : available += reclaimable - min(reclaimable / 2, wmark_low);
5805 :
5806 0 : if (available < 0)
5807 0 : available = 0;
5808 0 : return available;
5809 : }
5810 : EXPORT_SYMBOL_GPL(si_mem_available);
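/*
 * A worked example of the estimate above, with hypothetical numbers (all
 * in pages): free = 50,000, totalreserve = 4,000, file LRU page cache =
 * 200,000, reclaimable kernel memory = 30,000, sum of low watermarks =
 * 10,000. Page cache contributes 200,000 - min(100,000, 10,000) = 190,000,
 * reclaimable memory contributes 30,000 - min(15,000, 10,000) = 20,000,
 * so the estimate is 50,000 - 4,000 + 190,000 + 20,000 = 256,000 pages.
 */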
5811 :
5812 3 : void si_meminfo(struct sysinfo *val)
5813 : {
5814 3 : val->totalram = totalram_pages();
5815 3 : val->sharedram = global_node_page_state(NR_SHMEM);
5816 3 : val->freeram = global_zone_page_state(NR_FREE_PAGES);
5817 3 : val->bufferram = nr_blockdev_pages();
5818 3 : val->totalhigh = totalhigh_pages();
5819 3 : val->freehigh = nr_free_highpages();
5820 3 : val->mem_unit = PAGE_SIZE;
5821 3 : }
5822 :
5823 : EXPORT_SYMBOL(si_meminfo);
5824 :
5825 : #ifdef CONFIG_NUMA
5826 : void si_meminfo_node(struct sysinfo *val, int nid)
5827 : {
5828 : int zone_type; /* needs to be signed */
5829 : unsigned long managed_pages = 0;
5830 : unsigned long managed_highpages = 0;
5831 : unsigned long free_highpages = 0;
5832 : pg_data_t *pgdat = NODE_DATA(nid);
5833 :
5834 : for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
5835 : managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]);
5836 : val->totalram = managed_pages;
5837 : val->sharedram = node_page_state(pgdat, NR_SHMEM);
5838 : val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
5839 : #ifdef CONFIG_HIGHMEM
5840 : for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
5841 : struct zone *zone = &pgdat->node_zones[zone_type];
5842 :
5843 : if (is_highmem(zone)) {
5844 : managed_highpages += zone_managed_pages(zone);
5845 : free_highpages += zone_page_state(zone, NR_FREE_PAGES);
5846 : }
5847 : }
5848 : val->totalhigh = managed_highpages;
5849 : val->freehigh = free_highpages;
5850 : #else
5851 : val->totalhigh = managed_highpages;
5852 : val->freehigh = free_highpages;
5853 : #endif
5854 : val->mem_unit = PAGE_SIZE;
5855 : }
5856 : #endif
5857 :
5858 : /*
5859 : * Determine whether the node should be displayed or not, depending on whether
5860 : * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
5861 : */
5862 : static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask)
5863 : {
5864 0 : if (!(flags & SHOW_MEM_FILTER_NODES))
5865 : return false;
5866 :
5867 : /*
5868 : * no node mask - aka implicit memory numa policy. Do not bother with
5869 : * the synchronization - read_mems_allowed_begin - because we do not
5870 : * have to be precise here.
5871 : */
5872 0 : if (!nodemask)
5873 0 : nodemask = &cpuset_current_mems_allowed;
5874 :
5875 0 : return !node_isset(nid, *nodemask);
5876 : }
5877 :
5878 : #define K(x) ((x) << (PAGE_SHIFT-10))
5879 :
5880 0 : static void show_migration_types(unsigned char type)
5881 : {
5882 : static const char types[MIGRATE_TYPES] = {
5883 : [MIGRATE_UNMOVABLE] = 'U',
5884 : [MIGRATE_MOVABLE] = 'M',
5885 : [MIGRATE_RECLAIMABLE] = 'E',
5886 : [MIGRATE_HIGHATOMIC] = 'H',
5887 : #ifdef CONFIG_CMA
5888 : [MIGRATE_CMA] = 'C',
5889 : #endif
5890 : #ifdef CONFIG_MEMORY_ISOLATION
5891 : [MIGRATE_ISOLATE] = 'I',
5892 : #endif
5893 : };
5894 : char tmp[MIGRATE_TYPES + 1];
5895 0 : char *p = tmp;
5896 : int i;
5897 :
5898 0 : for (i = 0; i < MIGRATE_TYPES; i++) {
5899 0 : if (type & (1 << i))
5900 0 : *p++ = types[i];
5901 : }
5902 :
5903 0 : *p = '\0';
5904 0 : printk(KERN_CONT "(%s) ", tmp);
5905 0 : }
5906 :
5907 : /*
5908 : * Show free area list (used inside shift_scroll-lock stuff)
5909 : * We also calculate the percentage fragmentation. We do this by counting the
5910 : * memory on each free list with the exception of the first item on the list.
5911 : *
5912 : * Bits in @filter:
5913 : * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
5914 : * cpuset.
5915 : */
5916 0 : void show_free_areas(unsigned int filter, nodemask_t *nodemask)
5917 : {
5918 0 : unsigned long free_pcp = 0;
5919 : int cpu;
5920 : struct zone *zone;
5921 : pg_data_t *pgdat;
5922 :
5923 0 : for_each_populated_zone(zone) {
5924 0 : if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
5925 0 : continue;
5926 :
5927 0 : for_each_online_cpu(cpu)
5928 0 : free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count;
5929 : }
5930 :
5931 0 : printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
5932 : " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
5933 : " unevictable:%lu dirty:%lu writeback:%lu\n"
5934 : " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
5935 : " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
5936 : " kernel_misc_reclaimable:%lu\n"
5937 : " free:%lu free_pcp:%lu free_cma:%lu\n",
5938 : global_node_page_state(NR_ACTIVE_ANON),
5939 : global_node_page_state(NR_INACTIVE_ANON),
5940 : global_node_page_state(NR_ISOLATED_ANON),
5941 : global_node_page_state(NR_ACTIVE_FILE),
5942 : global_node_page_state(NR_INACTIVE_FILE),
5943 : global_node_page_state(NR_ISOLATED_FILE),
5944 : global_node_page_state(NR_UNEVICTABLE),
5945 : global_node_page_state(NR_FILE_DIRTY),
5946 : global_node_page_state(NR_WRITEBACK),
5947 : global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B),
5948 : global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B),
5949 : global_node_page_state(NR_FILE_MAPPED),
5950 : global_node_page_state(NR_SHMEM),
5951 : global_node_page_state(NR_PAGETABLE),
5952 : global_zone_page_state(NR_BOUNCE),
5953 : global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE),
5954 : global_zone_page_state(NR_FREE_PAGES),
5955 : free_pcp,
5956 : global_zone_page_state(NR_FREE_CMA_PAGES));
5957 :
5958 0 : for_each_online_pgdat(pgdat) {
5959 0 : if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
5960 0 : continue;
5961 :
5962 0 : printk("Node %d"
5963 : " active_anon:%lukB"
5964 : " inactive_anon:%lukB"
5965 : " active_file:%lukB"
5966 : " inactive_file:%lukB"
5967 : " unevictable:%lukB"
5968 : " isolated(anon):%lukB"
5969 : " isolated(file):%lukB"
5970 : " mapped:%lukB"
5971 : " dirty:%lukB"
5972 : " writeback:%lukB"
5973 : " shmem:%lukB"
5974 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
5975 : " shmem_thp: %lukB"
5976 : " shmem_pmdmapped: %lukB"
5977 : " anon_thp: %lukB"
5978 : #endif
5979 : " writeback_tmp:%lukB"
5980 : " kernel_stack:%lukB"
5981 : #ifdef CONFIG_SHADOW_CALL_STACK
5982 : " shadow_call_stack:%lukB"
5983 : #endif
5984 : " pagetables:%lukB"
5985 : " all_unreclaimable? %s"
5986 : "\n",
5987 : pgdat->node_id,
5988 : K(node_page_state(pgdat, NR_ACTIVE_ANON)),
5989 : K(node_page_state(pgdat, NR_INACTIVE_ANON)),
5990 : K(node_page_state(pgdat, NR_ACTIVE_FILE)),
5991 : K(node_page_state(pgdat, NR_INACTIVE_FILE)),
5992 : K(node_page_state(pgdat, NR_UNEVICTABLE)),
5993 : K(node_page_state(pgdat, NR_ISOLATED_ANON)),
5994 : K(node_page_state(pgdat, NR_ISOLATED_FILE)),
5995 : K(node_page_state(pgdat, NR_FILE_MAPPED)),
5996 : K(node_page_state(pgdat, NR_FILE_DIRTY)),
5997 : K(node_page_state(pgdat, NR_WRITEBACK)),
5998 : K(node_page_state(pgdat, NR_SHMEM)),
5999 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
6000 : K(node_page_state(pgdat, NR_SHMEM_THPS)),
6001 : K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)),
6002 : K(node_page_state(pgdat, NR_ANON_THPS)),
6003 : #endif
6004 : K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
6005 : node_page_state(pgdat, NR_KERNEL_STACK_KB),
6006 : #ifdef CONFIG_SHADOW_CALL_STACK
6007 : node_page_state(pgdat, NR_KERNEL_SCS_KB),
6008 : #endif
6009 : K(node_page_state(pgdat, NR_PAGETABLE)),
6010 : pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
6011 : "yes" : "no");
6012 : }
6013 :
6014 0 : for_each_populated_zone(zone) {
6015 : int i;
6016 :
6017 0 : if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
6018 0 : continue;
6019 :
6020 : free_pcp = 0;
6021 0 : for_each_online_cpu(cpu)
6022 0 : free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count;
6023 :
6024 0 : show_node(zone);
6025 0 : printk(KERN_CONT
6026 : "%s"
6027 : " free:%lukB"
6028 : " boost:%lukB"
6029 : " min:%lukB"
6030 : " low:%lukB"
6031 : " high:%lukB"
6032 : " reserved_highatomic:%luKB"
6033 : " active_anon:%lukB"
6034 : " inactive_anon:%lukB"
6035 : " active_file:%lukB"
6036 : " inactive_file:%lukB"
6037 : " unevictable:%lukB"
6038 : " writepending:%lukB"
6039 : " present:%lukB"
6040 : " managed:%lukB"
6041 : " mlocked:%lukB"
6042 : " bounce:%lukB"
6043 : " free_pcp:%lukB"
6044 : " local_pcp:%ukB"
6045 : " free_cma:%lukB"
6046 : "\n",
6047 : zone->name,
6048 : K(zone_page_state(zone, NR_FREE_PAGES)),
6049 : K(zone->watermark_boost),
6050 : K(min_wmark_pages(zone)),
6051 : K(low_wmark_pages(zone)),
6052 : K(high_wmark_pages(zone)),
6053 : K(zone->nr_reserved_highatomic),
6054 : K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
6055 : K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
6056 : K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
6057 : K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
6058 : K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
6059 : K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
6060 : K(zone->present_pages),
6061 : K(zone_managed_pages(zone)),
6062 : K(zone_page_state(zone, NR_MLOCK)),
6063 : K(zone_page_state(zone, NR_BOUNCE)),
6064 : K(free_pcp),
6065 : K(this_cpu_read(zone->per_cpu_pageset->count)),
6066 : K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
6067 0 : printk("lowmem_reserve[]:");
6068 0 : for (i = 0; i < MAX_NR_ZONES; i++)
6069 0 : printk(KERN_CONT " %ld", zone->lowmem_reserve[i]);
6070 0 : printk(KERN_CONT "\n");
6071 : }
6072 :
6073 0 : for_each_populated_zone(zone) {
6074 : unsigned int order;
6075 0 : unsigned long nr[MAX_ORDER], flags, total = 0;
6076 : unsigned char types[MAX_ORDER];
6077 :
6078 0 : if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
6079 0 : continue;
6080 0 : show_node(zone);
6081 0 : printk(KERN_CONT "%s: ", zone->name);
6082 :
6083 0 : spin_lock_irqsave(&zone->lock, flags);
6084 0 : for (order = 0; order < MAX_ORDER; order++) {
6085 0 : struct free_area *area = &zone->free_area[order];
6086 : int type;
6087 :
6088 0 : nr[order] = area->nr_free;
6089 0 : total += nr[order] << order;
6090 :
6091 0 : types[order] = 0;
6092 0 : for (type = 0; type < MIGRATE_TYPES; type++) {
6093 0 : if (!free_area_empty(area, type))
6094 0 : types[order] |= 1 << type;
6095 : }
6096 : }
6097 0 : spin_unlock_irqrestore(&zone->lock, flags);
6098 0 : for (order = 0; order < MAX_ORDER; order++) {
6099 0 : printk(KERN_CONT "%lu*%lukB ",
6100 : nr[order], K(1UL) << order);
6101 0 : if (nr[order])
6102 0 : show_migration_types(types[order]);
6103 : }
6104 0 : printk(KERN_CONT "= %lukB\n", K(total));
6105 : }
6106 :
6107 0 : hugetlb_show_meminfo();
6108 :
6109 0 : printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));
6110 :
6111 0 : show_swap_cache_info();
6112 0 : }
6113 :
6114 : static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
6115 : {
6116 1 : zoneref->zone = zone;
6117 1 : zoneref->zone_idx = zone_idx(zone);
6118 : }
6119 :
6120 : /*
6121 : * Builds allocation fallback zone lists.
6122 : *
6123 : * Add all populated zones of a node to the zonelist.
6124 : */
6125 : static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
6126 : {
6127 : struct zone *zone;
6128 1 : enum zone_type zone_type = MAX_NR_ZONES;
6129 1 : int nr_zones = 0;
6130 :
6131 : do {
6132 2 : zone_type--;
6133 2 : zone = pgdat->node_zones + zone_type;
6134 2 : if (populated_zone(zone)) {
6135 2 : zoneref_set_zone(zone, &zonerefs[nr_zones++]);
6136 1 : check_highest_zone(zone_type);
6137 : }
6138 2 : } while (zone_type);
6139 :
6140 : return nr_zones;
6141 : }
6142 :
6143 : #ifdef CONFIG_NUMA
6144 :
6145 : static int __parse_numa_zonelist_order(char *s)
6146 : {
6147 : /*
6148 : * We used to support different zonelists modes but they turned
6149 : * out to be just not useful. Let's keep the warning in place
6150 : * if somebody still uses the cmd line parameter so that we do
6151 : * not fail it silently
6152 : */
6153 : if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) {
6154 : pr_warn("Ignoring unsupported numa_zonelist_order value: %s\n", s);
6155 : return -EINVAL;
6156 : }
6157 : return 0;
6158 : }
6159 :
6160 : char numa_zonelist_order[] = "Node";
6161 :
6162 : /*
6163 : * sysctl handler for numa_zonelist_order
6164 : */
6165 : int numa_zonelist_order_handler(struct ctl_table *table, int write,
6166 : void *buffer, size_t *length, loff_t *ppos)
6167 : {
6168 : if (write)
6169 : return __parse_numa_zonelist_order(buffer);
6170 : return proc_dostring(table, write, buffer, length, ppos);
6171 : }
6172 :
6173 :
6174 : #define MAX_NODE_LOAD (nr_online_nodes)
6175 : static int node_load[MAX_NUMNODES];
6176 :
6177 : /**
6178 : * find_next_best_node - find the next node that should appear in a given node's fallback list
6179 : * @node: node whose fallback list we're appending
6180 : * @used_node_mask: nodemask_t of already used nodes
6181 : *
6182 : * We use a number of factors to determine which is the next node that should
6183 : * appear on a given node's fallback list. The node should not have appeared
6184 : * already in @node's fallback list, and it should be the next closest node
6185 : * according to the distance array (which contains arbitrary distance values
6186 : * from each node to each node in the system), and should also prefer nodes
6187 : * with no CPUs, since presumably they'll have very little allocation pressure
6188 : * on them otherwise.
6189 : *
6190 : * Return: node id of the found node or %NUMA_NO_NODE if no node is found.
6191 : */
6192 : int find_next_best_node(int node, nodemask_t *used_node_mask)
6193 : {
6194 : int n, val;
6195 : int min_val = INT_MAX;
6196 : int best_node = NUMA_NO_NODE;
6197 :
6198 : /* Use the local node if we haven't already */
6199 : if (!node_isset(node, *used_node_mask)) {
6200 : node_set(node, *used_node_mask);
6201 : return node;
6202 : }
6203 :
6204 : for_each_node_state(n, N_MEMORY) {
6205 :
6206 : /* Don't want a node to appear more than once */
6207 : if (node_isset(n, *used_node_mask))
6208 : continue;
6209 :
6210 : /* Use the distance array to find the distance */
6211 : val = node_distance(node, n);
6212 :
6213 : /* Penalize nodes under us ("prefer the next node") */
6214 : val += (n < node);
6215 :
6216 : /* Give preference to headless and unused nodes */
6217 : if (!cpumask_empty(cpumask_of_node(n)))
6218 : val += PENALTY_FOR_NODE_WITH_CPUS;
6219 :
6220 : /* Slight preference for less loaded node */
6221 : val *= (MAX_NODE_LOAD*MAX_NUMNODES);
6222 : val += node_load[n];
6223 :
6224 : if (val < min_val) {
6225 : min_val = val;
6226 : best_node = n;
6227 : }
6228 : }
6229 :
6230 : if (best_node >= 0)
6231 : node_set(best_node, *used_node_mask);
6232 :
6233 : return best_node;
6234 : }
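/*
 * A sketch of how the scoring above plays out, with hypothetical
 * distances: when extending node 0's fallback list, a node at distance
 * 20 that has CPUs is scored 20 + PENALTY_FOR_NODE_WITH_CPUS before
 * scaling, while a CPU-less node at distance 40 is scored just 40. The
 * node_load[] term added after scaling only breaks ties between
 * otherwise equal candidates, so the closer node still wins despite its
 * CPU penalty (assuming the default penalty of 1).
 */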
6235 :
6236 :
6237 : /*
6238 : * Build zonelists ordered by node and zones within node.
6239 : * This results in maximum locality--normal zone overflows into local
6240 : * DMA zone, if any--but risks exhausting DMA zone.
6241 : */
6242 : static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
6243 : unsigned nr_nodes)
6244 : {
6245 : struct zoneref *zonerefs;
6246 : int i;
6247 :
6248 : zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
6249 :
6250 : for (i = 0; i < nr_nodes; i++) {
6251 : int nr_zones;
6252 :
6253 : pg_data_t *node = NODE_DATA(node_order[i]);
6254 :
6255 : nr_zones = build_zonerefs_node(node, zonerefs);
6256 : zonerefs += nr_zones;
6257 : }
6258 : zonerefs->zone = NULL;
6259 : zonerefs->zone_idx = 0;
6260 : }
6261 :
6262 : /*
6263 : * Build gfp_thisnode zonelists
6264 : */
6265 : static void build_thisnode_zonelists(pg_data_t *pgdat)
6266 : {
6267 : struct zoneref *zonerefs;
6268 : int nr_zones;
6269 :
6270 : zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs;
6271 : nr_zones = build_zonerefs_node(pgdat, zonerefs);
6272 : zonerefs += nr_zones;
6273 : zonerefs->zone = NULL;
6274 : zonerefs->zone_idx = 0;
6275 : }
6276 :
6277 : /*
6278 : * Build zonelists ordered by zone and nodes within zones.
6279 : * This results in conserving DMA zone[s] until all Normal memory is
6280 : * exhausted, but results in overflowing to remote node while memory
6281 : * may still exist in local DMA zone.
6282 : */
6283 :
6284 : static void build_zonelists(pg_data_t *pgdat)
6285 : {
6286 : static int node_order[MAX_NUMNODES];
6287 : int node, load, nr_nodes = 0;
6288 : nodemask_t used_mask = NODE_MASK_NONE;
6289 : int local_node, prev_node;
6290 :
6291 : /* NUMA-aware ordering of nodes */
6292 : local_node = pgdat->node_id;
6293 : load = nr_online_nodes;
6294 : prev_node = local_node;
6295 :
6296 : memset(node_order, 0, sizeof(node_order));
6297 : while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
6298 : /*
6299 : * We don't want to pressure a particular node.
6300 : * So add a penalty to the first node in the same
6301 : * distance group to make it round-robin.
6302 : */
6303 : if (node_distance(local_node, node) !=
6304 : node_distance(local_node, prev_node))
6305 : node_load[node] += load;
6306 :
6307 : node_order[nr_nodes++] = node;
6308 : prev_node = node;
6309 : load--;
6310 : }
6311 :
6312 : build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
6313 : build_thisnode_zonelists(pgdat);
6314 : pr_info("Fallback order for Node %d: ", local_node);
6315 : for (node = 0; node < nr_nodes; node++)
6316 : pr_cont("%d ", node_order[node]);
6317 : pr_cont("\n");
6318 : }
6319 :
6320 : #ifdef CONFIG_HAVE_MEMORYLESS_NODES
6321 : /*
6322 : * Return node id of node used for "local" allocations.
6323 : * I.e., first node id of first zone in arg node's generic zonelist.
6324 : * Used for initializing percpu 'numa_mem', which is used primarily
6325 : * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
6326 : */
6327 : int local_memory_node(int node)
6328 : {
6329 : struct zoneref *z;
6330 :
6331 : z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
6332 : gfp_zone(GFP_KERNEL),
6333 : NULL);
6334 : return zone_to_nid(z->zone);
6335 : }
6336 : #endif
6337 :
6338 : static void setup_min_unmapped_ratio(void);
6339 : static void setup_min_slab_ratio(void);
6340 : #else /* CONFIG_NUMA */
6341 :
6342 1 : static void build_zonelists(pg_data_t *pgdat)
6343 : {
6344 : int node, local_node;
6345 : struct zoneref *zonerefs;
6346 : int nr_zones;
6347 :
6348 1 : local_node = pgdat->node_id;
6349 :
6350 1 : zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
6351 1 : nr_zones = build_zonerefs_node(pgdat, zonerefs);
6352 1 : zonerefs += nr_zones;
6353 :
6354 : /*
6355 : * Now we build the zonelist so that it contains the zones
6356 : * of all the other nodes.
6357 : * We don't want to pressure a particular node, so when
6358 : * building the zones for node N, we make sure that the
6359 : * zones coming right after the local ones are those from
6360 : * node N+1 (modulo N)
6361 : */
6362 1 : for (node = local_node + 1; node < MAX_NUMNODES; node++) {
6363 0 : if (!node_online(node))
6364 0 : continue;
6365 0 : nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
6366 0 : zonerefs += nr_zones;
6367 : }
6368 0 : for (node = 0; node < local_node; node++) {
6369 0 : if (!node_online(node))
6370 0 : continue;
6371 0 : nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
6372 0 : zonerefs += nr_zones;
6373 : }
6374 :
6375 1 : zonerefs->zone = NULL;
6376 1 : zonerefs->zone_idx = 0;
6377 1 : }
6378 :
6379 : #endif /* CONFIG_NUMA */
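/*
 * A worked example of the ordering described in the !NUMA build_zonelists()
 * above, assuming a hypothetical layout with four online nodes: when
 * building the fallback zonelist for node 2, its own zones come first,
 * followed by the zones of nodes 3, 0 and 1; that is, the other nodes are
 * visited starting at local_node + 1 and wrapping around modulo
 * MAX_NUMNODES.
 */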
6380 :
6381 : /*
6382 : * Boot pageset table. One per cpu which is going to be used for all
6383 : * zones and all nodes. The parameters will be set in such a way
6384 : * that an item put on a list will immediately be handed over to
6385 : * the buddy list. This is safe since pageset manipulation is done
6386 : * with interrupts disabled.
6387 : *
6388 : * The boot_pagesets must be kept even after bootup is complete for
6389 : * unused processors and/or zones. They do play a role for bootstrapping
6390 : * hotplugged processors.
6391 : *
6392 : * zoneinfo_show() and maybe other functions do
6393 : * not check if the processor is online before following the pageset pointer.
6394 : * Other parts of the kernel may not check if the zone is available.
6395 : */
6396 : static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats);
6397 : /* These effectively disable the pcplists in the boot pageset completely */
6398 : #define BOOT_PAGESET_HIGH 0
6399 : #define BOOT_PAGESET_BATCH 1
6400 : static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset);
6401 : static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats);
6402 : DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
6403 :
6404 0 : static void __build_all_zonelists(void *data)
6405 : {
6406 : int nid;
6407 : int __maybe_unused cpu;
6408 1 : pg_data_t *self = data;
6409 : static DEFINE_SPINLOCK(lock);
6410 :
6411 1 : spin_lock(&lock);
6412 :
6413 : #ifdef CONFIG_NUMA
6414 : memset(node_load, 0, sizeof(node_load));
6415 : #endif
6416 :
6417 : /*
6418 : * This node is hotadded and no memory is yet present. So just
6419 : * building zonelists is fine - no need to touch other nodes.
6420 : */
6421 0 : if (self && !node_online(self->node_id)) {
6422 0 : build_zonelists(self);
6423 : } else {
6424 : /*
6425 : * All possible nodes have pgdat preallocated
6426 : * in free_area_init
6427 : */
6428 1 : for_each_node(nid) {
6429 1 : pg_data_t *pgdat = NODE_DATA(nid);
6430 :
6431 1 : build_zonelists(pgdat);
6432 : }
6433 :
6434 : #ifdef CONFIG_HAVE_MEMORYLESS_NODES
6435 : /*
6436 : * We now know the "local memory node" for each node--
6437 : * i.e., the node of the first zone in the generic zonelist.
6438 : * Set up numa_mem percpu variable for on-line cpus. During
6439 : * boot, only the boot cpu should be on-line; we'll init the
6440 : * secondary cpus' numa_mem as they come on-line. During
6441 : * node/memory hotplug, we'll fixup all on-line cpus.
6442 : */
6443 : for_each_online_cpu(cpu)
6444 : set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
6445 : #endif
6446 : }
6447 :
6448 1 : spin_unlock(&lock);
6449 0 : }
6450 :
6451 : static noinline void __init
6452 1 : build_all_zonelists_init(void)
6453 : {
6454 : int cpu;
6455 :
6456 1 : __build_all_zonelists(NULL);
6457 :
6458 : /*
6459 : * Initialize the boot_pagesets that are going to be used
6460 : * for bootstrapping processors. The real pagesets for
6461 : * each zone will be allocated later when the per cpu
6462 : * allocator is available.
6463 : *
6464 : * boot_pagesets are used also for bootstrapping offline
6465 : * cpus if the system is already booted because the pagesets
6466 : * are needed to initialize allocators on a specific cpu too.
6467 : * F.e. the percpu allocator needs the page allocator which
6468 : * needs the percpu allocator in order to allocate its pagesets
6469 : * (a chicken-egg dilemma).
6470 : */
6471 2 : for_each_possible_cpu(cpu)
6472 1 : per_cpu_pages_init(&per_cpu(boot_pageset, cpu), &per_cpu(boot_zonestats, cpu));
6473 :
6474 1 : mminit_verify_zonelist();
6475 : cpuset_init_current_mems_allowed();
6476 1 : }
6477 :
6478 : /*
6479 : * unless system_state == SYSTEM_BOOTING.
6480 : *
6481 : * __ref due to call of __init annotated helper build_all_zonelists_init
6482 : * [protected by SYSTEM_BOOTING].
6483 : */
6484 1 : void __ref build_all_zonelists(pg_data_t *pgdat)
6485 : {
6486 : unsigned long vm_total_pages;
6487 :
6488 1 : if (system_state == SYSTEM_BOOTING) {
6489 1 : build_all_zonelists_init();
6490 : } else {
6491 0 : __build_all_zonelists(pgdat);
6492 : /* cpuset refresh routine should be here */
6493 : }
6494 : /* Get the number of free pages beyond high watermark in all zones. */
6495 1 : vm_total_pages = nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
6496 : /*
6497 : * Disable grouping by mobility if the number of pages in the
6498 : * system is too low to allow the mechanism to work. It would be
6499 : * more accurate, but expensive to check per-zone. This check is
6500 : * made on memory-hotadd so a system can start with mobility
6501 : * disabled and enable it later
6502 : */
6503 1 : if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
6504 0 : page_group_by_mobility_disabled = 1;
6505 : else
6506 1 : page_group_by_mobility_disabled = 0;
6507 :
6508 1 : pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n",
6509 : nr_online_nodes,
6510 : page_group_by_mobility_disabled ? "off" : "on",
6511 : vm_total_pages);
6512 : #ifdef CONFIG_NUMA
6513 : pr_info("Policy zone: %s\n", zone_names[policy_zone]);
6514 : #endif
6515 1 : }
6516 :
6517 : /* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
6518 : static bool __meminit
6519 266125 : overlap_memmap_init(unsigned long zone, unsigned long *pfn)
6520 : {
6521 : static struct memblock_region *r;
6522 :
6523 266125 : if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
6524 0 : if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
6525 0 : for_each_mem_region(r) {
6526 0 : if (*pfn < memblock_region_memory_end_pfn(r))
6527 : break;
6528 : }
6529 : }
6530 0 : if (*pfn >= memblock_region_memory_base_pfn(r) &&
6531 0 : memblock_is_mirror(r)) {
6532 0 : *pfn = memblock_region_memory_end_pfn(r);
6533 0 : return true;
6534 : }
6535 : }
6536 : return false;
6537 : }
6538 :
6539 : /*
6540 : * Initially all pages are reserved - free ones are freed
6541 : * up by memblock_free_all() once the early boot process is
6542 : * done. Non-atomic initialization, single-pass.
6543 : *
6544 : * All aligned pageblocks are initialized to the specified migratetype
6545 : * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
6546 : * zone stats (e.g., nr_isolate_pageblock) are touched.
6547 : */
6548 1 : void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone,
6549 : unsigned long start_pfn, unsigned long zone_end_pfn,
6550 : enum meminit_context context,
6551 : struct vmem_altmap *altmap, int migratetype)
6552 : {
6553 1 : unsigned long pfn, end_pfn = start_pfn + size;
6554 : struct page *page;
6555 :
6556 1 : if (highest_memmap_pfn < end_pfn - 1)
6557 1 : highest_memmap_pfn = end_pfn - 1;
6558 :
6559 : #ifdef CONFIG_ZONE_DEVICE
6560 : /*
6561 : * Honor reservation requested by the driver for this ZONE_DEVICE
6562 : * memory. We limit the total number of pages to initialize to just
6563 : * those that might contain the memory mapping. We will defer the
6564 : * ZONE_DEVICE page initialization until after we have released
6565 : * the hotplug lock.
6566 : */
6567 : if (zone == ZONE_DEVICE) {
6568 : if (!altmap)
6569 : return;
6570 :
6571 : if (start_pfn == altmap->base_pfn)
6572 : start_pfn += altmap->reserve;
6573 : end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
6574 : }
6575 : #endif
6576 :
6577 266127 : for (pfn = start_pfn; pfn < end_pfn; ) {
6578 : /*
6579 : * There can be holes in boot-time mem_map[]s handed to this
6580 : * function. They do not exist on hotplugged memory.
6581 : */
6582 266125 : if (context == MEMINIT_EARLY) {
6583 266125 : if (overlap_memmap_init(zone, &pfn))
6584 0 : continue;
6585 : if (defer_init(nid, pfn, zone_end_pfn))
6586 : break;
6587 : }
6588 :
6589 266125 : page = pfn_to_page(pfn);
6590 266125 : __init_single_page(page, pfn, zone, nid);
6591 266125 : if (context == MEMINIT_HOTPLUG)
6592 : __SetPageReserved(page);
6593 :
6594 : /*
6595 : * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
6596 : * such that unmovable allocations won't be scattered all
6597 : * over the place during system boot.
6598 : */
6599 266125 : if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
6600 260 : set_pageblock_migratetype(page, migratetype);
6601 260 : cond_resched();
6602 : }
6603 266125 : pfn++;
6604 : }
6605 1 : }
6606 :
6607 : #ifdef CONFIG_ZONE_DEVICE
6608 : static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
6609 : unsigned long zone_idx, int nid,
6610 : struct dev_pagemap *pgmap)
6611 : {
6612 :
6613 : __init_single_page(page, pfn, zone_idx, nid);
6614 :
6615 : /*
6616 : * Mark page reserved as it will need to wait for onlining
6617 : * phase for it to be fully associated with a zone.
6618 : *
6619 : * We can use the non-atomic __set_bit operation for setting
6620 : * the flag as we are still initializing the pages.
6621 : */
6622 : __SetPageReserved(page);
6623 :
6624 : /*
6625 : * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
6626 : * and zone_device_data. It is a bug if a ZONE_DEVICE page is
6627 : * ever freed or placed on a driver-private list.
6628 : */
6629 : page->pgmap = pgmap;
6630 : page->zone_device_data = NULL;
6631 :
6632 : /*
6633 : * Mark the block movable so that blocks are reserved for
6634 : * movable at startup. This will force kernel allocations
6635 : * to reserve their blocks rather than leaking throughout
6636 : * the address space during boot when many long-lived
6637 : * kernel allocations are made.
6638 : *
6639 : * Please note that MEMINIT_HOTPLUG path doesn't clear memmap
6640 : * because this is done early in section_activate()
6641 : */
6642 : if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
6643 : set_pageblock_migratetype(page, MIGRATE_MOVABLE);
6644 : cond_resched();
6645 : }
6646 : }
6647 :
6648 : static void __ref memmap_init_compound(struct page *head,
6649 : unsigned long head_pfn,
6650 : unsigned long zone_idx, int nid,
6651 : struct dev_pagemap *pgmap,
6652 : unsigned long nr_pages)
6653 : {
6654 : unsigned long pfn, end_pfn = head_pfn + nr_pages;
6655 : unsigned int order = pgmap->vmemmap_shift;
6656 :
6657 : __SetPageHead(head);
6658 : for (pfn = head_pfn + 1; pfn < end_pfn; pfn++) {
6659 : struct page *page = pfn_to_page(pfn);
6660 :
6661 : __init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
6662 : prep_compound_tail(head, pfn - head_pfn);
6663 : set_page_count(page, 0);
6664 :
6665 : /*
6666 : * The first tail page stores compound_mapcount_ptr() and
6667 : * compound_order() and the second tail page stores
6668 : * compound_pincount_ptr(). Call prep_compound_head() after
6669 : * the first and second tail pages have been initialized, so that
6670 : * this data is not overwritten.
6671 : */
6672 : if (pfn == head_pfn + 2)
6673 : prep_compound_head(head, order);
6674 : }
6675 : }
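 : /*
 :  * For instance (assuming a ZONE_DEVICE mapping created with
 :  * pgmap->vmemmap_shift == 9, i.e. 2MiB compound pages built from 4KiB
 :  * base pages), each compound page covers 512 pfns: the caller
 :  * initialises the head page and this helper prepares the remaining 511
 :  * tail pages, deferring prep_compound_head() until the first two tail
 :  * pages hold valid data.
 :  */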
6676 :
6677 : void __ref memmap_init_zone_device(struct zone *zone,
6678 : unsigned long start_pfn,
6679 : unsigned long nr_pages,
6680 : struct dev_pagemap *pgmap)
6681 : {
6682 : unsigned long pfn, end_pfn = start_pfn + nr_pages;
6683 : struct pglist_data *pgdat = zone->zone_pgdat;
6684 : struct vmem_altmap *altmap = pgmap_altmap(pgmap);
6685 : unsigned int pfns_per_compound = pgmap_vmemmap_nr(pgmap);
6686 : unsigned long zone_idx = zone_idx(zone);
6687 : unsigned long start = jiffies;
6688 : int nid = pgdat->node_id;
6689 :
6690 : if (WARN_ON_ONCE(!pgmap || zone_idx(zone) != ZONE_DEVICE))
6691 : return;
6692 :
6693 : /*
6694 : * The call to memmap_init should have already taken care
6695 : * of the pages reserved for the memmap, so we can just jump to
6696 : * the end of that region and start processing the device pages.
6697 : */
6698 : if (altmap) {
6699 : start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
6700 : nr_pages = end_pfn - start_pfn;
6701 : }
6702 :
6703 : for (pfn = start_pfn; pfn < end_pfn; pfn += pfns_per_compound) {
6704 : struct page *page = pfn_to_page(pfn);
6705 :
6706 : __init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
6707 :
6708 : if (pfns_per_compound == 1)
6709 : continue;
6710 :
6711 : memmap_init_compound(page, pfn, zone_idx, nid, pgmap,
6712 : pfns_per_compound);
6713 : }
6714 :
6715 : pr_info("%s initialised %lu pages in %ums\n", __func__,
6716 : nr_pages, jiffies_to_msecs(jiffies - start));
6717 : }
6718 :
6719 : #endif
6720 1 : static void __meminit zone_init_free_lists(struct zone *zone)
6721 : {
6722 : unsigned int order, t;
6723 45 : for_each_migratetype_order(order, t) {
6724 88 : INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
6725 44 : zone->free_area[order].nr_free = 0;
6726 : }
6727 1 : }
6728 :
6729 : /*
6730 : * Only struct pages that correspond to ranges defined by memblock.memory
6731 : * are zeroed and initialized by going through __init_single_page() during
6732 : * memmap_init_zone_range().
6733 : *
6734 : * But, there could be struct pages that correspond to holes in
6735 : * memblock.memory. This can happen because of the following reasons:
6736 : * - physical memory bank size is not necessarily an exact multiple of the
6737 : * arbitrary section size
6738 : * - early reserved memory may not be listed in memblock.memory
6739 : * - memory layouts defined with the memmap= kernel parameter may not align
6740 : * nicely with memmap sections
6741 : *
6742 : * Explicitly initialize those struct pages so that:
6743 : * - PG_reserved is set
6744 : * - zone and node links point to zone and node that span the page if the
6745 : * hole is in the middle of a zone
6746 : * - zone and node links point to adjacent zone/node if the hole falls on
6747 : * the zone boundary; the pages in such holes will be prepended to the
6748 : * zone/node above the hole except for the trailing pages in the last
6749 : * section that will be appended to the zone/node below.
6750 : */
6751 1 : static void __init init_unavailable_range(unsigned long spfn,
6752 : unsigned long epfn,
6753 : int zone, int node)
6754 : {
6755 : unsigned long pfn;
6756 1 : u64 pgcnt = 0;
6757 :
6758 1 : for (pfn = spfn; pfn < epfn; pfn++) {
6759 0 : if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
6760 0 : pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
6761 : + pageblock_nr_pages - 1;
6762 0 : continue;
6763 : }
6764 0 : __init_single_page(pfn_to_page(pfn), pfn, zone, node);
6765 0 : __SetPageReserved(pfn_to_page(pfn));
6766 0 : pgcnt++;
6767 : }
6768 :
6769 1 : if (pgcnt)
6770 0 : pr_info("On node %d, zone %s: %lld pages in unavailable ranges",
6771 : node, zone_names[zone], pgcnt);
6772 1 : }
6773 :
6774 1 : static void __init memmap_init_zone_range(struct zone *zone,
6775 : unsigned long start_pfn,
6776 : unsigned long end_pfn,
6777 : unsigned long *hole_pfn)
6778 : {
6779 1 : unsigned long zone_start_pfn = zone->zone_start_pfn;
6780 1 : unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;
6781 1 : int nid = zone_to_nid(zone), zone_id = zone_idx(zone);
6782 :
6783 1 : start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
6784 1 : end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);
6785 :
6786 1 : if (start_pfn >= end_pfn)
6787 : return;
6788 :
6789 1 : memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn,
6790 : zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
6791 :
6792 1 : if (*hole_pfn < start_pfn)
6793 0 : init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid);
6794 :
6795 1 : *hole_pfn = end_pfn;
6796 : }
6797 :
6798 1 : static void __init memmap_init(void)
6799 : {
6800 : unsigned long start_pfn, end_pfn;
6801 1 : unsigned long hole_pfn = 0;
6802 1 : int i, j, zone_id = 0, nid;
6803 :
6804 2 : for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
6805 : struct pglist_data *node = NODE_DATA(nid);
6806 :
6807 2 : for (j = 0; j < MAX_NR_ZONES; j++) {
6808 2 : struct zone *zone = node->node_zones + j;
6809 :
6810 2 : if (!populated_zone(zone))
6811 1 : continue;
6812 :
6813 1 : memmap_init_zone_range(zone, start_pfn, end_pfn,
6814 : &hole_pfn);
6815 1 : zone_id = j;
6816 : }
6817 : }
6818 :
6819 : #ifdef CONFIG_SPARSEMEM
6820 : /*
6821 : * Initialize the memory map for the hole in the range [memory_end,
6822 : * section_end].
6823 : * Append the pages in this hole to the highest zone in the last
6824 : * node.
6825 : * The call to init_unavailable_range() is outside the ifdef to
6826 : * silence the compiler warning about zone_id being set but not used;
6827 : * for FLATMEM it is a nop anyway
6828 : */
6829 : end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
6830 : if (hole_pfn < end_pfn)
6831 : #endif
6832 1 : init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
6833 1 : }
6834 :
6835 1 : void __init *memmap_alloc(phys_addr_t size, phys_addr_t align,
6836 : phys_addr_t min_addr, int nid, bool exact_nid)
6837 : {
6838 : void *ptr;
6839 :
6840 1 : if (exact_nid)
6841 0 : ptr = memblock_alloc_exact_nid_raw(size, align, min_addr,
6842 : MEMBLOCK_ALLOC_ACCESSIBLE,
6843 : nid);
6844 : else
6845 1 : ptr = memblock_alloc_try_nid_raw(size, align, min_addr,
6846 : MEMBLOCK_ALLOC_ACCESSIBLE,
6847 : nid);
6848 :
6849 : if (ptr && size > 0)
6850 : page_init_poison(ptr, size);
6851 :
6852 1 : return ptr;
6853 : }
6854 :
6855 3 : static int zone_batchsize(struct zone *zone)
6856 : {
6857 : #ifdef CONFIG_MMU
6858 : int batch;
6859 :
6860 : /*
6861 : * The number of pages to batch allocate is either ~0.1%
6862 : * of the zone or 1MB, whichever is smaller. The batch
6863 : * size is striking a balance between allocation latency
6864 : * and zone lock contention.
6865 : */
6866 3 : batch = min(zone_managed_pages(zone) >> 10, (1024 * 1024) / PAGE_SIZE);
6867 3 : batch /= 4; /* We effectively *= 4 below */
6868 3 : if (batch < 1)
6869 1 : batch = 1;
6870 :
6871 : /*
6872 : * Clamp the batch to a 2^n - 1 value. Having a power
6873 : * of 2 value was found to be more likely to have
6874 : * suboptimal cache aliasing properties in some cases.
6875 : *
6876 : * For example if 2 tasks are alternately allocating
6877 : * batches of pages, one task can end up with a lot
6878 : * of pages of one half of the possible page colors
6879 : * and the other with pages of the other colors.
6880 : */
6881 5 : batch = rounddown_pow_of_two(batch + batch/2) - 1;
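 : /*
 :  * Worked example (illustrative numbers only): for a zone with 4GiB of
 :  * managed 4KiB pages, zone_managed_pages() >> 10 is 1024, the 1MB cap
 :  * reduces that to 256, dividing by 4 gives 64, and
 :  * rounddown_pow_of_two(64 + 32) - 1 yields a batch of 63.
 :  */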
6882 :
6883 3 : return batch;
6884 :
6885 : #else
6886 : /* The deferral and batching of frees should be suppressed under NOMMU
6887 : * conditions.
6888 : *
6889 : * The problem is that NOMMU needs to be able to allocate large chunks
6890 : * of contiguous memory as there's no hardware page translation to
6891 : * assemble apparent contiguous memory from discontiguous pages.
6892 : *
6893 : * Queueing large contiguous runs of pages for batching, however,
6894 : * causes the pages to actually be freed in smaller chunks. As there
6895 : * can be a significant delay between the individual batches being
6896 : * recycled, this leads to the once large chunks of space being
6897 : * fragmented and becoming unavailable for high-order allocations.
6898 : */
6899 : return 0;
6900 : #endif
6901 : }
6902 :
6903 3 : static int zone_highsize(struct zone *zone, int batch, int cpu_online)
6904 : {
6905 : #ifdef CONFIG_MMU
6906 : int high;
6907 : int nr_split_cpus;
6908 : unsigned long total_pages;
6909 :
6910 3 : if (!percpu_pagelist_high_fraction) {
6911 : /*
6912 : * By default, the high value of the pcp is based on the zone
6913 : * low watermark so that even when the pcp lists are full, background
6914 : * reclaim will not be started prematurely.
6915 : */
6916 3 : total_pages = low_wmark_pages(zone);
6917 : } else {
6918 : /*
6919 : * If percpu_pagelist_high_fraction is configured, the high
6920 : * value is based on a fraction of the managed pages in the
6921 : * zone.
6922 : */
6923 0 : total_pages = zone_managed_pages(zone) / percpu_pagelist_high_fraction;
6924 : }
6925 :
6926 : /*
6927 : * Split the high value across all online CPUs local to the zone. Note
6928 : * that early in boot CPUs may not be online yet, and that during
6929 : * CPU hotplug the cpumask is not yet updated when a CPU is being
6930 : * onlined. For memory nodes that have no CPUs, split pcp->high across
6931 : * all online CPUs to mitigate the risk that reclaim is triggered
6932 : * prematurely due to pages stored on pcp lists.
6933 : */
6934 6 : nr_split_cpus = cpumask_weight(cpumask_of_node(zone_to_nid(zone))) + cpu_online;
6935 3 : if (!nr_split_cpus)
6936 0 : nr_split_cpus = num_online_cpus();
6937 3 : high = total_pages / nr_split_cpus;
6938 :
6939 : /*
6940 : * Ensure high is at least batch*4. The multiple is based on the
6941 : * historical relationship between high and batch.
6942 : */
6943 3 : high = max(high, batch << 2);
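 : /*
 :  * Illustrative example: with a low watermark of 4096 pages and a
 :  * single CPU local to the zone, high works out to 4096; the
 :  * batch << 2 floor (252 when batch is 63) only matters for very small
 :  * zones where the watermark-derived value would be lower.
 :  */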
6944 :
6945 3 : return high;
6946 : #else
6947 : return 0;
6948 : #endif
6949 : }
6950 :
6951 : /*
6952 : * pcp->high and pcp->batch values are related and generally batch is lower
6953 : * than high. They are also related to pcp->count such that count is lower
6954 : * than high, and as soon as it reaches high, the pcplist is flushed.
6955 : *
6956 : * However, guaranteeing these relations at all times would require e.g. write
6957 : * barriers here but also careful usage of read barriers at the read side, and
6958 : * thus be error prone and bad for performance. The update therefore only prevents
6959 : * store tearing. Any new users of pcp->batch and pcp->high should ensure they
6960 : * can cope with those fields changing asynchronously, and fully trust only the
6961 : * pcp->count field on the local CPU with interrupts disabled.
6962 : *
6963 : * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
6964 : * outside of boot time (or some other assurance that no concurrent updaters
6965 : * exist).
6966 : */
6967 : static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
6968 : unsigned long batch)
6969 : {
6970 3 : WRITE_ONCE(pcp->batch, batch);
6971 3 : WRITE_ONCE(pcp->high, high);
6972 : }
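 : /*
 :  * Readers of pcp->high and pcp->batch (e.g. the pcp allocation and
 :  * free fast paths) are expected to pair these writes with READ_ONCE()
 :  * and, as noted above, to tolerate a momentarily inconsistent
 :  * high/batch pair.
 :  */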
6973 :
6974 2 : static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats)
6975 : {
6976 : int pindex;
6977 :
6978 2 : memset(pcp, 0, sizeof(*pcp));
6979 2 : memset(pzstats, 0, sizeof(*pzstats));
6980 :
6981 26 : for (pindex = 0; pindex < NR_PCP_LISTS; pindex++)
6982 48 : INIT_LIST_HEAD(&pcp->lists[pindex]);
6983 :
6984 : /*
6985 : * Set batch and high values safe for a boot pageset. A true percpu
6986 : * pageset's initialization will update them subsequently. Here we don't
6987 : * need to be as careful as pageset_update() as nobody can access the
6988 : * pageset yet.
6989 : */
6990 2 : pcp->high = BOOT_PAGESET_HIGH;
6991 2 : pcp->batch = BOOT_PAGESET_BATCH;
6992 2 : pcp->free_factor = 0;
6993 2 : }
6994 :
6995 : static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high,
6996 : unsigned long batch)
6997 : {
6998 : struct per_cpu_pages *pcp;
6999 : int cpu;
7000 :
7001 3 : for_each_possible_cpu(cpu) {
7002 3 : pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
7003 3 : pageset_update(pcp, high, batch);
7004 : }
7005 : }
7006 :
7007 : /*
7008 : * Calculate and set new high and batch values for all per-cpu pagesets of a
7009 : * zone based on the zone's size.
7010 : */
7011 3 : static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online)
7012 : {
7013 : int new_high, new_batch;
7014 :
7015 3 : new_batch = max(1, zone_batchsize(zone));
7016 3 : new_high = zone_highsize(zone, new_batch, cpu_online);
7017 :
7018 3 : if (zone->pageset_high == new_high &&
7019 0 : zone->pageset_batch == new_batch)
7020 : return;
7021 :
7022 3 : zone->pageset_high = new_high;
7023 3 : zone->pageset_batch = new_batch;
7024 :
7025 3 : __zone_set_pageset_high_and_batch(zone, new_high, new_batch);
7026 : }
7027 :
7028 1 : void __meminit setup_zone_pageset(struct zone *zone)
7029 : {
7030 : int cpu;
7031 :
7032 : /* Size may be 0 on !SMP && !NUMA */
7033 : if (sizeof(struct per_cpu_zonestat) > 0)
7034 : zone->per_cpu_zonestats = alloc_percpu(struct per_cpu_zonestat);
7035 :
7036 1 : zone->per_cpu_pageset = alloc_percpu(struct per_cpu_pages);
7037 2 : for_each_possible_cpu(cpu) {
7038 : struct per_cpu_pages *pcp;
7039 : struct per_cpu_zonestat *pzstats;
7040 :
7041 1 : pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
7042 1 : pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
7043 1 : per_cpu_pages_init(pcp, pzstats);
7044 : }
7045 :
7046 1 : zone_set_pageset_high_and_batch(zone, 0);
7047 1 : }
7048 :
7049 : /*
7050 : * Allocate per cpu pagesets and initialize them.
7051 : * Before this call only boot pagesets were available.
7052 : */
7053 1 : void __init setup_per_cpu_pageset(void)
7054 : {
7055 : struct pglist_data *pgdat;
7056 : struct zone *zone;
7057 : int __maybe_unused cpu;
7058 :
7059 3 : for_each_populated_zone(zone)
7060 1 : setup_zone_pageset(zone);
7061 :
7062 : #ifdef CONFIG_NUMA
7063 : /*
7064 : * Unpopulated zones continue using the boot pagesets.
7065 : * The numa stats for these pagesets need to be reset.
7066 : * Otherwise, they will end up skewing the stats of
7067 : * the nodes these zones are associated with.
7068 : */
7069 : for_each_possible_cpu(cpu) {
7070 : struct per_cpu_zonestat *pzstats = &per_cpu(boot_zonestats, cpu);
7071 : memset(pzstats->vm_numa_event, 0,
7072 : sizeof(pzstats->vm_numa_event));
7073 : }
7074 : #endif
7075 :
7076 2 : for_each_online_pgdat(pgdat)
7077 1 : pgdat->per_cpu_nodestats =
7078 1 : alloc_percpu(struct per_cpu_nodestat);
7079 1 : }
7080 :
7081 : static __meminit void zone_pcp_init(struct zone *zone)
7082 : {
7083 : /*
7084 : * per cpu subsystem is not up at this point. The following code
7085 : * relies on the ability of the linker to provide the
7086 : * offset of a (static) per cpu variable into the per cpu area.
7087 : */
7088 2 : zone->per_cpu_pageset = &boot_pageset;
7089 2 : zone->per_cpu_zonestats = &boot_zonestats;
7090 2 : zone->pageset_high = BOOT_PAGESET_HIGH;
7091 2 : zone->pageset_batch = BOOT_PAGESET_BATCH;
7092 :
7093 2 : if (populated_zone(zone))
7094 : pr_debug(" %s zone: %lu pages, LIFO batch:%u\n", zone->name,
7095 : zone->present_pages, zone_batchsize(zone));
7096 : }
7097 :
7098 1 : void __meminit init_currently_empty_zone(struct zone *zone,
7099 : unsigned long zone_start_pfn,
7100 : unsigned long size)
7101 : {
7102 1 : struct pglist_data *pgdat = zone->zone_pgdat;
7103 1 : int zone_idx = zone_idx(zone) + 1;
7104 :
7105 1 : if (zone_idx > pgdat->nr_zones)
7106 1 : pgdat->nr_zones = zone_idx;
7107 :
7108 1 : zone->zone_start_pfn = zone_start_pfn;
7109 :
7110 1 : mminit_dprintk(MMINIT_TRACE, "memmap_init",
7111 : "Initialising map node %d zone %lu pfns %lu -> %lu\n",
7112 : pgdat->node_id,
7113 : (unsigned long)zone_idx(zone),
7114 : zone_start_pfn, (zone_start_pfn + size));
7115 :
7116 1 : zone_init_free_lists(zone);
7117 1 : zone->initialized = 1;
7118 1 : }
7119 :
7120 : /**
7121 : * get_pfn_range_for_nid - Return the start and end page frames for a node
7122 : * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
7123 : * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
7124 : * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
7125 : *
7126 : * It returns the start and end page frame of a node based on information
7127 : * provided by memblock_set_node(). If called for a node
7128 : * with no available memory, the start and end
7129 : * PFNs will be 0.
7130 : */
7131 1 : void __init get_pfn_range_for_nid(unsigned int nid,
7132 : unsigned long *start_pfn, unsigned long *end_pfn)
7133 : {
7134 : unsigned long this_start_pfn, this_end_pfn;
7135 : int i;
7136 :
7137 1 : *start_pfn = -1UL;
7138 1 : *end_pfn = 0;
7139 :
7140 2 : for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
7141 1 : *start_pfn = min(*start_pfn, this_start_pfn);
7142 1 : *end_pfn = max(*end_pfn, this_end_pfn);
7143 : }
7144 :
7145 1 : if (*start_pfn == -1UL)
7146 0 : *start_pfn = 0;
7147 1 : }
7148 :
7149 : /*
7150 : * This finds a zone that can be used for ZONE_MOVABLE pages. The
7151 : * assumption is made that zones within a node are ordered by monotonically
7152 : * increasing memory address so that the "highest" populated zone is used
7153 : */
7154 1 : static void __init find_usable_zone_for_movable(void)
7155 : {
7156 : int zone_index;
7157 2 : for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
7158 2 : if (zone_index == ZONE_MOVABLE)
7159 1 : continue;
7160 :
7161 2 : if (arch_zone_highest_possible_pfn[zone_index] >
7162 1 : arch_zone_lowest_possible_pfn[zone_index])
7163 : break;
7164 : }
7165 :
7166 : VM_BUG_ON(zone_index == -1);
7167 1 : movable_zone = zone_index;
7168 1 : }
7169 :
7170 : /*
7171 : * The zone ranges provided by the architecture do not include ZONE_MOVABLE
7172 : * because it is sized independently of the architecture. Unlike the other zones,
7173 : * the starting point for ZONE_MOVABLE is not fixed. It may be different
7174 : * in each node depending on the size of each node and how evenly kernelcore
7175 : * is distributed. This helper function adjusts the zone ranges
7176 : * provided by the architecture for a given node by using the end of the
7177 : * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
7178 : * zones within a node are in order of monotonically increasing memory addresses
7179 : */
7180 4 : static void __init adjust_zone_range_for_zone_movable(int nid,
7181 : unsigned long zone_type,
7182 : unsigned long node_start_pfn,
7183 : unsigned long node_end_pfn,
7184 : unsigned long *zone_start_pfn,
7185 : unsigned long *zone_end_pfn)
7186 : {
7187 : /* Only adjust if ZONE_MOVABLE is on this node */
7188 4 : if (zone_movable_pfn[nid]) {
7189 : /* Size ZONE_MOVABLE */
7190 0 : if (zone_type == ZONE_MOVABLE) {
7191 0 : *zone_start_pfn = zone_movable_pfn[nid];
7192 0 : *zone_end_pfn = min(node_end_pfn,
7193 : arch_zone_highest_possible_pfn[movable_zone]);
7194 :
7195 : /* Adjust for ZONE_MOVABLE starting within this range */
7196 0 : } else if (!mirrored_kernelcore &&
7197 0 : *zone_start_pfn < zone_movable_pfn[nid] &&
7198 0 : *zone_end_pfn > zone_movable_pfn[nid]) {
7199 0 : *zone_end_pfn = zone_movable_pfn[nid];
7200 :
7201 : /* Check if this whole range is within ZONE_MOVABLE */
7202 0 : } else if (*zone_start_pfn >= zone_movable_pfn[nid])
7203 0 : *zone_start_pfn = *zone_end_pfn;
7204 : }
7205 4 : }
7206 :
7207 : /*
7208 : * Return the number of pages a zone spans in a node, including holes
7209 : * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
7210 : */
7211 2 : static unsigned long __init zone_spanned_pages_in_node(int nid,
7212 : unsigned long zone_type,
7213 : unsigned long node_start_pfn,
7214 : unsigned long node_end_pfn,
7215 : unsigned long *zone_start_pfn,
7216 : unsigned long *zone_end_pfn)
7217 : {
7218 2 : unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
7219 2 : unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
7220 : /* When hot-adding a new node from cpu_up(), the node should be empty */
7221 2 : if (!node_start_pfn && !node_end_pfn)
7222 : return 0;
7223 :
7224 : /* Get the start and end of the zone */
7225 2 : *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
7226 2 : *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
7227 2 : adjust_zone_range_for_zone_movable(nid, zone_type,
7228 : node_start_pfn, node_end_pfn,
7229 : zone_start_pfn, zone_end_pfn);
7230 :
7231 : /* Check that this node has pages within the zone's required range */
7232 2 : if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
7233 : return 0;
7234 :
7235 : /* Move the zone boundaries inside the node if necessary */
7236 2 : *zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
7237 2 : *zone_start_pfn = max(*zone_start_pfn, node_start_pfn);
7238 :
7239 : /* Return the spanned pages */
7240 2 : return *zone_end_pfn - *zone_start_pfn;
7241 : }
7242 :
7243 : /*
7244 : * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
7245 : * then all holes in the requested range will be accounted for.
7246 : */
7247 2 : unsigned long __init __absent_pages_in_range(int nid,
7248 : unsigned long range_start_pfn,
7249 : unsigned long range_end_pfn)
7250 : {
7251 2 : unsigned long nr_absent = range_end_pfn - range_start_pfn;
7252 : unsigned long start_pfn, end_pfn;
7253 : int i;
7254 :
7255 4 : for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
7256 2 : start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
7257 2 : end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
7258 2 : nr_absent -= end_pfn - start_pfn;
7259 : }
7260 2 : return nr_absent;
7261 : }
7262 :
7263 : /**
7264 : * absent_pages_in_range - Return number of page frames in holes within a range
7265 : * @start_pfn: The start PFN to start searching for holes
7266 : * @end_pfn: The end PFN to stop searching for holes
7267 : *
7268 : * Return: the number of page frames in memory holes within a range.
7269 : */
7270 0 : unsigned long __init absent_pages_in_range(unsigned long start_pfn,
7271 : unsigned long end_pfn)
7272 : {
7273 0 : return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
7274 : }
7275 :
7276 : /* Return the number of page frames in holes in a zone on a node */
7277 2 : static unsigned long __init zone_absent_pages_in_node(int nid,
7278 : unsigned long zone_type,
7279 : unsigned long node_start_pfn,
7280 : unsigned long node_end_pfn)
7281 : {
7282 2 : unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
7283 2 : unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
7284 : unsigned long zone_start_pfn, zone_end_pfn;
7285 : unsigned long nr_absent;
7286 :
7287 : /* When hot-adding a new node from cpu_up(), the node should be empty */
7288 2 : if (!node_start_pfn && !node_end_pfn)
7289 : return 0;
7290 :
7291 2 : zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
7292 2 : zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
7293 :
7294 2 : adjust_zone_range_for_zone_movable(nid, zone_type,
7295 : node_start_pfn, node_end_pfn,
7296 : &zone_start_pfn, &zone_end_pfn);
7297 2 : nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
7298 :
7299 : /*
7300 : * ZONE_MOVABLE handling.
7301 : * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
7302 : * and vice versa.
7303 : */
7304 2 : if (mirrored_kernelcore && zone_movable_pfn[nid]) {
7305 : unsigned long start_pfn, end_pfn;
7306 : struct memblock_region *r;
7307 :
7308 0 : for_each_mem_region(r) {
7309 0 : start_pfn = clamp(memblock_region_memory_base_pfn(r),
7310 : zone_start_pfn, zone_end_pfn);
7311 0 : end_pfn = clamp(memblock_region_memory_end_pfn(r),
7312 : zone_start_pfn, zone_end_pfn);
7313 :
7314 0 : if (zone_type == ZONE_MOVABLE &&
7315 0 : memblock_is_mirror(r))
7316 0 : nr_absent += end_pfn - start_pfn;
7317 :
7318 0 : if (zone_type == ZONE_NORMAL &&
7319 0 : !memblock_is_mirror(r))
7320 0 : nr_absent += end_pfn - start_pfn;
7321 : }
7322 : }
7323 :
7324 : return nr_absent;
7325 : }
7326 :
7327 1 : static void __init calculate_node_totalpages(struct pglist_data *pgdat,
7328 : unsigned long node_start_pfn,
7329 : unsigned long node_end_pfn)
7330 : {
7331 1 : unsigned long realtotalpages = 0, totalpages = 0;
7332 : enum zone_type i;
7333 :
7334 3 : for (i = 0; i < MAX_NR_ZONES; i++) {
7335 2 : struct zone *zone = pgdat->node_zones + i;
7336 : unsigned long zone_start_pfn, zone_end_pfn;
7337 : unsigned long spanned, absent;
7338 : unsigned long size, real_size;
7339 :
7340 2 : spanned = zone_spanned_pages_in_node(pgdat->node_id, i,
7341 : node_start_pfn,
7342 : node_end_pfn,
7343 : &zone_start_pfn,
7344 : &zone_end_pfn);
7345 2 : absent = zone_absent_pages_in_node(pgdat->node_id, i,
7346 : node_start_pfn,
7347 : node_end_pfn);
7348 :
7349 2 : size = spanned;
7350 2 : real_size = size - absent;
7351 :
7352 2 : if (size)
7353 1 : zone->zone_start_pfn = zone_start_pfn;
7354 : else
7355 1 : zone->zone_start_pfn = 0;
7356 2 : zone->spanned_pages = size;
7357 2 : zone->present_pages = real_size;
7358 : #if defined(CONFIG_MEMORY_HOTPLUG)
7359 : zone->present_early_pages = real_size;
7360 : #endif
7361 :
7362 2 : totalpages += size;
7363 2 : realtotalpages += real_size;
7364 : }
7365 :
7366 1 : pgdat->node_spanned_pages = totalpages;
7367 1 : pgdat->node_present_pages = realtotalpages;
7368 : pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
7369 1 : }
7370 :
7371 : #ifndef CONFIG_SPARSEMEM
7372 : /*
7373 : * Calculate the size of the zone->pageblock_flags bitmap, rounded up to an
7374 : * unsigned long. Start by making sure zonesize is a multiple of
7375 : * pageblock_nr_pages by rounding up, then use NR_PAGEBLOCK_BITS worth of bits
7376 : * per pageblock, round the result up to the nearest unsigned long, and finally
7377 : * return it in bytes.
7378 : */
7379 1 : static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
7380 : {
7381 : unsigned long usemapsize;
7382 :
7383 1 : zonesize += zone_start_pfn & (pageblock_nr_pages-1);
7384 1 : usemapsize = roundup(zonesize, pageblock_nr_pages);
7385 1 : usemapsize = usemapsize >> pageblock_order;
7386 1 : usemapsize *= NR_PAGEBLOCK_BITS;
7387 1 : usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
7388 :
7389 1 : return usemapsize / 8;
7390 : }
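 : /*
 :  * Worked example (illustrative): for a 128MiB zone of 4KiB pages that
 :  * starts on a pageblock boundary, with 1024-page pageblocks and
 :  * NR_PAGEBLOCK_BITS == 4, zonesize is 32768 pages -> 32 pageblocks ->
 :  * 128 bits, which already rounds to 128 bits and so returns 16 bytes.
 :  */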
7391 :
7392 1 : static void __ref setup_usemap(struct zone *zone)
7393 : {
7394 1 : unsigned long usemapsize = usemap_size(zone->zone_start_pfn,
7395 : zone->spanned_pages);
7396 1 : zone->pageblock_flags = NULL;
7397 1 : if (usemapsize) {
7398 1 : zone->pageblock_flags =
7399 2 : memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
7400 : zone_to_nid(zone));
7401 1 : if (!zone->pageblock_flags)
7402 0 : panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
7403 : usemapsize, zone->name, zone_to_nid(zone));
7404 : }
7405 1 : }
7406 : #else
7407 : static inline void setup_usemap(struct zone *zone) {}
7408 : #endif /* CONFIG_SPARSEMEM */
7409 :
7410 : #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
7411 :
7412 : /* Initialise pageblock_order: each set of NR_PAGEBLOCK_BITS pageblock flags covers 1 << pageblock_order pages */
7413 : void __init set_pageblock_order(void)
7414 : {
7415 : unsigned int order = MAX_ORDER - 1;
7416 :
7417 : /* Check that pageblock_nr_pages has not already been setup */
7418 : if (pageblock_order)
7419 : return;
7420 :
7421 : /* Don't let pageblocks exceed the maximum allocation granularity. */
7422 : if (HPAGE_SHIFT > PAGE_SHIFT && HUGETLB_PAGE_ORDER < order)
7423 : order = HUGETLB_PAGE_ORDER;
7424 :
7425 : /*
7426 : * Assume the largest contiguous order of interest is a huge page.
7427 : * This value may be variable depending on boot parameters on IA64 and
7428 : * powerpc.
7429 : */
7430 : pageblock_order = order;
7431 : }
7432 : #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
7433 :
7434 : /*
7435 : * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
7436 : * is unused as pageblock_order is set at compile-time. See
7437 : * include/linux/pageblock-flags.h for the values of pageblock_order based on
7438 : * the kernel config
7439 : */
7440 0 : void __init set_pageblock_order(void)
7441 : {
7442 0 : }
7443 :
7444 : #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
7445 :
7446 : static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
7447 : unsigned long present_pages)
7448 : {
7449 2 : unsigned long pages = spanned_pages;
7450 :
7451 : /*
7452 : * Provide a more accurate estimation if there are holes within
7453 : * the zone and SPARSEMEM is in use. If there are holes within the
7454 : * zone, each populated memory region may cost us one or two extra
7455 : * memmap pages due to alignment because memmap pages for each
7456 : * populated region may not be naturally aligned on a page boundary.
7457 : * So the (present_pages >> 4) heuristic is a tradeoff for that.
7458 : */
7459 : if (spanned_pages > present_pages + (present_pages >> 4) &&
7460 : IS_ENABLED(CONFIG_SPARSEMEM))
7461 : pages = present_pages;
7462 :
7463 2 : return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
7464 : }
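 : /*
 :  * Ballpark figure (assuming 4KiB pages and a 64-byte struct page): a
 :  * fully populated 1GiB zone spans 262144 pages, so its memmap costs
 :  * 262144 * 64 bytes = 16MiB, i.e. 4096 pages, which is the value
 :  * subtracted from freesize in free_area_init_core() below.
 :  */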
7465 :
7466 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
7467 : static void pgdat_init_split_queue(struct pglist_data *pgdat)
7468 : {
7469 : struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
7470 :
7471 : spin_lock_init(&ds_queue->split_queue_lock);
7472 : INIT_LIST_HEAD(&ds_queue->split_queue);
7473 : ds_queue->split_queue_len = 0;
7474 : }
7475 : #else
7476 : static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
7477 : #endif
7478 :
7479 : #ifdef CONFIG_COMPACTION
7480 : static void pgdat_init_kcompactd(struct pglist_data *pgdat)
7481 : {
7482 1 : init_waitqueue_head(&pgdat->kcompactd_wait);
7483 : }
7484 : #else
7485 : static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
7486 : #endif
7487 :
7488 1 : static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
7489 : {
7490 : int i;
7491 :
7492 1 : pgdat_resize_init(pgdat);
7493 :
7494 1 : pgdat_init_split_queue(pgdat);
7495 1 : pgdat_init_kcompactd(pgdat);
7496 :
7497 1 : init_waitqueue_head(&pgdat->kswapd_wait);
7498 1 : init_waitqueue_head(&pgdat->pfmemalloc_wait);
7499 :
7500 5 : for (i = 0; i < NR_VMSCAN_THROTTLE; i++)
7501 4 : init_waitqueue_head(&pgdat->reclaim_wait[i]);
7502 :
7503 1 : pgdat_page_ext_init(pgdat);
7504 1 : lruvec_init(&pgdat->__lruvec);
7505 1 : }
7506 :
7507 2 : static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
7508 : unsigned long remaining_pages)
7509 : {
7510 4 : atomic_long_set(&zone->managed_pages, remaining_pages);
7511 2 : zone_set_nid(zone, nid);
7512 2 : zone->name = zone_names[idx];
7513 2 : zone->zone_pgdat = NODE_DATA(nid);
7514 2 : spin_lock_init(&zone->lock);
7515 2 : zone_seqlock_init(zone);
7516 2 : zone_pcp_init(zone);
7517 2 : }
7518 :
7519 : /*
7520 : * Set up the zone data structures
7521 : * - init pgdat internals
7522 : * - init all zones belonging to this node
7523 : *
7524 : * NOTE: this function is only called during memory hotplug
7525 : */
7526 : #ifdef CONFIG_MEMORY_HOTPLUG
7527 : void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
7528 : {
7529 : int nid = pgdat->node_id;
7530 : enum zone_type z;
7531 : int cpu;
7532 :
7533 : pgdat_init_internals(pgdat);
7534 :
7535 : if (pgdat->per_cpu_nodestats == &boot_nodestats)
7536 : pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
7537 :
7538 : /*
7539 : * Reset the nr_zones, order and highest_zoneidx before reuse.
7540 : * Note that kswapd will init kswapd_highest_zoneidx properly
7541 : * when it starts in the near future.
7542 : */
7543 : pgdat->nr_zones = 0;
7544 : pgdat->kswapd_order = 0;
7545 : pgdat->kswapd_highest_zoneidx = 0;
7546 : pgdat->node_start_pfn = 0;
7547 : for_each_online_cpu(cpu) {
7548 : struct per_cpu_nodestat *p;
7549 :
7550 : p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
7551 : memset(p, 0, sizeof(*p));
7552 : }
7553 :
7554 : for (z = 0; z < MAX_NR_ZONES; z++)
7555 : zone_init_internals(&pgdat->node_zones[z], z, nid, 0);
7556 : }
7557 : #endif
7558 :
7559 : /*
7560 : * Set up the zone data structures:
7561 : * - mark all pages reserved
7562 : * - mark all memory queues empty
7563 : * - clear the memory bitmaps
7564 : *
7565 : * NOTE: pgdat should get zeroed by caller.
7566 : * NOTE: this function is only called during early init.
7567 : */
7568 1 : static void __init free_area_init_core(struct pglist_data *pgdat)
7569 : {
7570 : enum zone_type j;
7571 1 : int nid = pgdat->node_id;
7572 :
7573 1 : pgdat_init_internals(pgdat);
7574 1 : pgdat->per_cpu_nodestats = &boot_nodestats;
7575 :
7576 3 : for (j = 0; j < MAX_NR_ZONES; j++) {
7577 2 : struct zone *zone = pgdat->node_zones + j;
7578 : unsigned long size, freesize, memmap_pages;
7579 :
7580 2 : size = zone->spanned_pages;
7581 2 : freesize = zone->present_pages;
7582 :
7583 : /*
7584 : * Adjust freesize so that it accounts for how much memory
7585 : * is used by this zone for memmap. This affects the watermark
7586 : * and per-cpu initialisations
7587 : */
7588 4 : memmap_pages = calc_memmap_size(size, freesize);
7589 2 : if (!is_highmem_idx(j)) {
7590 2 : if (freesize >= memmap_pages) {
7591 2 : freesize -= memmap_pages;
7592 : if (memmap_pages)
7593 : pr_debug(" %s zone: %lu pages used for memmap\n",
7594 : zone_names[j], memmap_pages);
7595 : } else
7596 0 : pr_warn(" %s zone: %lu memmap pages exceeds freesize %lu\n",
7597 : zone_names[j], memmap_pages, freesize);
7598 : }
7599 :
7600 : /* Account for reserved pages */
7601 2 : if (j == 0 && freesize > dma_reserve) {
7602 1 : freesize -= dma_reserve;
7603 : pr_debug(" %s zone: %lu pages reserved\n", zone_names[0], dma_reserve);
7604 : }
7605 :
7606 2 : if (!is_highmem_idx(j))
7607 2 : nr_kernel_pages += freesize;
7608 : /* Charge for highmem memmap if there are enough kernel pages */
7609 : else if (nr_kernel_pages > memmap_pages * 2)
7610 : nr_kernel_pages -= memmap_pages;
7611 2 : nr_all_pages += freesize;
7612 :
7613 : /*
7614 : * Set an approximate value for lowmem here; it will be adjusted
7615 : * when the bootmem allocator frees pages into the buddy system.
7616 : * And all highmem pages will be managed by the buddy system.
7617 : */
7618 2 : zone_init_internals(zone, j, nid, freesize);
7619 :
7620 2 : if (!size)
7621 1 : continue;
7622 :
7623 : set_pageblock_order();
7624 1 : setup_usemap(zone);
7625 1 : init_currently_empty_zone(zone, zone->zone_start_pfn, size);
7626 : }
7627 1 : }
7628 :
7629 : #ifdef CONFIG_FLATMEM
7630 1 : static void __init alloc_node_mem_map(struct pglist_data *pgdat)
7631 : {
7632 1 : unsigned long __maybe_unused start = 0;
7633 1 : unsigned long __maybe_unused offset = 0;
7634 :
7635 : /* Skip empty nodes */
7636 1 : if (!pgdat->node_spanned_pages)
7637 : return;
7638 :
7639 1 : start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
7640 1 : offset = pgdat->node_start_pfn - start;
7641 : /* ia64 gets its own node_mem_map, before this, without bootmem */
7642 1 : if (!pgdat->node_mem_map) {
7643 : unsigned long size, end;
7644 : struct page *map;
7645 :
7646 : /*
7647 : * The zone's endpoints aren't required to be MAX_ORDER
7648 : * aligned but the node_mem_map endpoints must be in order
7649 : * for the buddy allocator to function correctly.
7650 : */
7651 2 : end = pgdat_end_pfn(pgdat);
7652 1 : end = ALIGN(end, MAX_ORDER_NR_PAGES);
7653 1 : size = (end - start) * sizeof(struct page);
7654 1 : map = memmap_alloc(size, SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT,
7655 : pgdat->node_id, false);
7656 1 : if (!map)
7657 0 : panic("Failed to allocate %ld bytes for node %d memory map\n",
7658 : size, pgdat->node_id);
7659 1 : pgdat->node_mem_map = map + offset;
7660 : }
7661 : pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
7662 : __func__, pgdat->node_id, (unsigned long)pgdat,
7663 : (unsigned long)pgdat->node_mem_map);
7664 : #ifndef CONFIG_NUMA
7665 : /*
7666 : * With no DISCONTIG, the global mem_map is just set as node 0's
7667 : */
7668 1 : if (pgdat == NODE_DATA(0)) {
7669 1 : mem_map = NODE_DATA(0)->node_mem_map;
7670 1 : if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
7671 0 : mem_map -= offset;
7672 : }
7673 : #endif
7674 : }
7675 : #else
7676 : static inline void alloc_node_mem_map(struct pglist_data *pgdat) { }
7677 : #endif /* CONFIG_FLATMEM */
7678 :
7679 : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
7680 : static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
7681 : {
7682 : pgdat->first_deferred_pfn = ULONG_MAX;
7683 : }
7684 : #else
7685 : static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
7686 : #endif
7687 :
7688 1 : static void __init free_area_init_node(int nid)
7689 : {
7690 1 : pg_data_t *pgdat = NODE_DATA(nid);
7691 1 : unsigned long start_pfn = 0;
7692 1 : unsigned long end_pfn = 0;
7693 :
7694 : /* pg_data_t should be reset to zero when it's allocated */
7695 1 : WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx);
7696 :
7697 1 : get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
7698 :
7699 1 : pgdat->node_id = nid;
7700 1 : pgdat->node_start_pfn = start_pfn;
7701 1 : pgdat->per_cpu_nodestats = NULL;
7702 :
7703 1 : if (start_pfn != end_pfn) {
7704 1 : pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
7705 : (u64)start_pfn << PAGE_SHIFT,
7706 : end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
7707 : } else {
7708 0 : pr_info("Initmem setup node %d as memoryless\n", nid);
7709 : }
7710 :
7711 1 : calculate_node_totalpages(pgdat, start_pfn, end_pfn);
7712 :
7713 1 : alloc_node_mem_map(pgdat);
7714 : pgdat_set_deferred_range(pgdat);
7715 :
7716 1 : free_area_init_core(pgdat);
7717 1 : }
7718 :
7719 : static void __init free_area_init_memoryless_node(int nid)
7720 : {
7721 : free_area_init_node(nid);
7722 : }
7723 :
7724 : #if MAX_NUMNODES > 1
7725 : /*
7726 : * Figure out the number of possible node ids.
7727 : */
7728 : void __init setup_nr_node_ids(void)
7729 : {
7730 : unsigned int highest;
7731 :
7732 : highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
7733 : nr_node_ids = highest + 1;
7734 : }
7735 : #endif
7736 :
7737 : /**
7738 : * node_map_pfn_alignment - determine the maximum internode alignment
7739 : *
7740 : * This function should be called after node map is populated and sorted.
7741 : * It calculates the maximum power of two alignment which can distinguish
7742 : * all the nodes.
7743 : *
7744 : * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
7745 : * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
7746 : * nodes are shifted by 256MiB, 256MiB alignment is indicated. Note that if only the last node is
7747 : * shifted, 1GiB is enough and this function will indicate so.
7748 : *
7749 : * This is used to test whether pfn -> nid mapping of the chosen memory
7750 : * model has fine enough granularity to avoid incorrect mapping for the
7751 : * populated node map.
7752 : *
7753 : * Return: the determined alignment in pfn's. 0 if there is no alignment
7754 : * requirement (single node).
7755 : */
7756 0 : unsigned long __init node_map_pfn_alignment(void)
7757 : {
7758 0 : unsigned long accl_mask = 0, last_end = 0;
7759 : unsigned long start, end, mask;
7760 0 : int last_nid = NUMA_NO_NODE;
7761 : int i, nid;
7762 :
7763 0 : for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
7764 0 : if (!start || last_nid < 0 || last_nid == nid) {
7765 0 : last_nid = nid;
7766 0 : last_end = end;
7767 0 : continue;
7768 : }
7769 :
7770 : /*
7771 : * Start with a mask granular enough to pin-point to the
7772 : * start pfn and tick off bits one-by-one until it becomes
7773 : * too coarse to separate the current node from the last.
7774 : */
7775 0 : mask = ~((1 << __ffs(start)) - 1);
7776 0 : while (mask && last_end <= (start & (mask << 1)))
7777 : mask <<= 1;
7778 :
7779 : /* accumulate all internode masks */
7780 0 : accl_mask |= mask;
7781 : }
7782 :
7783 : /* convert mask to number of pages */
7784 0 : return ~accl_mask + 1;
7785 : }
7786 :
7787 : /**
7788 : * find_min_pfn_with_active_regions - Find the minimum PFN registered
7789 : *
7790 : * Return: the minimum PFN based on information provided via
7791 : * memblock_set_node().
7792 : */
7793 1 : unsigned long __init find_min_pfn_with_active_regions(void)
7794 : {
7795 1 : return PHYS_PFN(memblock_start_of_DRAM());
7796 : }
7797 :
7798 : /*
7799 : * early_calculate_totalpages()
7800 : * Sum pages in active regions for movable zone.
7801 : * Populate N_MEMORY for calculating usable_nodes.
7802 : */
7803 1 : static unsigned long __init early_calculate_totalpages(void)
7804 : {
7805 1 : unsigned long totalpages = 0;
7806 : unsigned long start_pfn, end_pfn;
7807 : int i, nid;
7808 :
7809 2 : for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
7810 1 : unsigned long pages = end_pfn - start_pfn;
7811 :
7812 1 : totalpages += pages;
7813 : if (pages)
7814 : node_set_state(nid, N_MEMORY);
7815 : }
7816 1 : return totalpages;
7817 : }
7818 :
7819 : /*
7820 : * Find the PFN at which ZONE_MOVABLE begins in each node. Kernel memory
7821 : * is spread evenly between nodes as long as the nodes have enough
7822 : * memory. When they don't, some nodes will have more kernelcore than
7823 : * others
7824 : */
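 : /*
 :  * Illustrative example (not a guaranteed layout): with kernelcore=2G on
 :  * a machine with two 4GiB nodes, roughly 1GiB of kernelcore is assigned
 :  * to each node, so zone_movable_pfn[nid] ends up about 1GiB into each
 :  * node and the remainder of each node becomes ZONE_MOVABLE.
 :  */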
7825 1 : static void __init find_zone_movable_pfns_for_nodes(void)
7826 : {
7827 : int i, nid;
7828 : unsigned long usable_startpfn;
7829 : unsigned long kernelcore_node, kernelcore_remaining;
7830 : /* save the state before borrowing the nodemask */
7831 1 : nodemask_t saved_node_state = node_states[N_MEMORY];
7832 1 : unsigned long totalpages = early_calculate_totalpages();
7833 1 : int usable_nodes = nodes_weight(node_states[N_MEMORY]);
7834 : struct memblock_region *r;
7835 :
7836 : /* Need to find movable_zone earlier when movable_node is specified. */
7837 1 : find_usable_zone_for_movable();
7838 :
7839 : /*
7840 : * If movable_node is specified, ignore kernelcore and movablecore
7841 : * options.
7842 : */
7843 : if (movable_node_is_enabled()) {
7844 : for_each_mem_region(r) {
7845 : if (!memblock_is_hotpluggable(r))
7846 : continue;
7847 :
7848 : nid = memblock_get_region_node(r);
7849 :
7850 : usable_startpfn = PFN_DOWN(r->base);
7851 : zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
7852 : min(usable_startpfn, zone_movable_pfn[nid]) :
7853 : usable_startpfn;
7854 : }
7855 :
7856 : goto out2;
7857 : }
7858 :
7859 : /*
7860 : * If kernelcore=mirror is specified, ignore movablecore option
7861 : */
7862 1 : if (mirrored_kernelcore) {
7863 0 : bool mem_below_4gb_not_mirrored = false;
7864 :
7865 0 : for_each_mem_region(r) {
7866 0 : if (memblock_is_mirror(r))
7867 0 : continue;
7868 :
7869 0 : nid = memblock_get_region_node(r);
7870 :
7871 0 : usable_startpfn = memblock_region_memory_base_pfn(r);
7872 :
7873 0 : if (usable_startpfn < 0x100000) {
7874 0 : mem_below_4gb_not_mirrored = true;
7875 0 : continue;
7876 : }
7877 :
7878 0 : zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
7879 0 : min(usable_startpfn, zone_movable_pfn[nid]) :
7880 : usable_startpfn;
7881 : }
7882 :
7883 0 : if (mem_below_4gb_not_mirrored)
7884 0 : pr_warn("This configuration results in unmirrored kernel memory.\n");
7885 :
7886 : goto out2;
7887 : }
7888 :
7889 : /*
7890 : * If kernelcore=nn% or movablecore=nn% was specified, calculate the
7891 : * amount of necessary memory.
7892 : */
7893 1 : if (required_kernelcore_percent)
7894 0 : required_kernelcore = (totalpages * 100 * required_kernelcore_percent) /
7895 : 10000UL;
7896 1 : if (required_movablecore_percent)
7897 0 : required_movablecore = (totalpages * 100 * required_movablecore_percent) /
7898 : 10000UL;
7899 :
7900 : /*
7901 : * If movablecore= was specified, calculate the corresponding size of
7902 : * kernelcore so that memory usable for
7903 : * any allocation type is evenly spread. If both kernelcore
7904 : * and movablecore are specified, then the value of kernelcore
7905 : * will be used for required_kernelcore if it's greater than
7906 : * what movablecore would have allowed.
7907 : */
7908 1 : if (required_movablecore) {
7909 : unsigned long corepages;
7910 :
7911 : /*
7912 : * Round-up so that ZONE_MOVABLE is at least as large as what
7913 : * was requested by the user
7914 : */
7915 : required_movablecore =
7916 0 : roundup(required_movablecore, MAX_ORDER_NR_PAGES);
7917 0 : required_movablecore = min(totalpages, required_movablecore);
7918 0 : corepages = totalpages - required_movablecore;
7919 :
7920 0 : required_kernelcore = max(required_kernelcore, corepages);
7921 : }
7922 :
7923 : /*
7924 : * If kernelcore was not specified or kernelcore size is larger
7925 : * than totalpages, there is no ZONE_MOVABLE.
7926 : */
7927 1 : if (!required_kernelcore || required_kernelcore >= totalpages)
7928 : goto out;
7929 :
7930 : /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
7931 0 : usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
7932 :
7933 : restart:
7934 : /* Spread kernelcore memory as evenly as possible throughout nodes */
7935 0 : kernelcore_node = required_kernelcore / usable_nodes;
7936 0 : for_each_node_state(nid, N_MEMORY) {
7937 : unsigned long start_pfn, end_pfn;
7938 :
7939 : /*
7940 : * Recalculate kernelcore_node if the division per node
7941 : * now exceeds what is necessary to satisfy the requested
7942 : * amount of memory for the kernel
7943 : */
7944 0 : if (required_kernelcore < kernelcore_node)
7945 0 : kernelcore_node = required_kernelcore / usable_nodes;
7946 :
7947 : /*
7948 : * As the map is walked, we track how much memory is usable
7949 : * by the kernel using kernelcore_remaining. When it is
7950 : * 0, the rest of the node is usable by ZONE_MOVABLE
7951 : */
7952 0 : kernelcore_remaining = kernelcore_node;
7953 :
7954 : /* Go through each range of PFNs within this node */
7955 0 : for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
7956 : unsigned long size_pages;
7957 :
7958 0 : start_pfn = max(start_pfn, zone_movable_pfn[nid]);
7959 0 : if (start_pfn >= end_pfn)
7960 0 : continue;
7961 :
7962 : /* Account for what is only usable for kernelcore */
7963 0 : if (start_pfn < usable_startpfn) {
7964 : unsigned long kernel_pages;
7965 0 : kernel_pages = min(end_pfn, usable_startpfn)
7966 : - start_pfn;
7967 :
7968 0 : kernelcore_remaining -= min(kernel_pages,
7969 : kernelcore_remaining);
7970 0 : required_kernelcore -= min(kernel_pages,
7971 : required_kernelcore);
7972 :
7973 : /* Continue if range is now fully accounted */
7974 0 : if (end_pfn <= usable_startpfn) {
7975 :
7976 : /*
7977 : * Push zone_movable_pfn to the end so
7978 : * that if we have to rebalance
7979 : * kernelcore across nodes, we will
7980 : * not double account here
7981 : */
7982 0 : zone_movable_pfn[nid] = end_pfn;
7983 0 : continue;
7984 : }
7985 0 : start_pfn = usable_startpfn;
7986 : }
7987 :
7988 : /*
7989 : * The usable PFN range for ZONE_MOVABLE is from
7990 : * start_pfn->end_pfn. Calculate size_pages as the
7991 : * number of pages used as kernelcore
7992 : */
7993 0 : size_pages = end_pfn - start_pfn;
7994 0 : if (size_pages > kernelcore_remaining)
7995 0 : size_pages = kernelcore_remaining;
7996 0 : zone_movable_pfn[nid] = start_pfn + size_pages;
7997 :
7998 : /*
7999 : * Some kernelcore has been met, update counts and
8000 : * break if the kernelcore for this node has been
8001 : * satisfied
8002 : */
8003 0 : required_kernelcore -= min(required_kernelcore,
8004 : size_pages);
8005 0 : kernelcore_remaining -= size_pages;
8006 0 : if (!kernelcore_remaining)
8007 : break;
8008 : }
8009 : }
8010 :
8011 : /*
8012 : * If there is still required_kernelcore, we do another pass with one
8013 : * less node in the count. This will push zone_movable_pfn[nid] further
8014 : * along on the nodes that still have memory until kernelcore is
8015 : * satisfied
8016 : */
8017 0 : usable_nodes--;
8018 0 : if (usable_nodes && required_kernelcore > usable_nodes)
8019 : goto restart;
8020 :
8021 : out2:
8022 : /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
8023 0 : for (nid = 0; nid < MAX_NUMNODES; nid++) {
8024 : unsigned long start_pfn, end_pfn;
8025 :
8026 0 : zone_movable_pfn[nid] =
8027 0 : roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
8028 :
8029 0 : get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
8030 0 : if (zone_movable_pfn[nid] >= end_pfn)
8031 0 : zone_movable_pfn[nid] = 0;
8032 : }
8033 :
8034 : out:
8035 : /* restore the node_state */
8036 1 : node_states[N_MEMORY] = saved_node_state;
8037 1 : }
8038 :
8039 : /* Any regular or high memory on that node ? */
8040 : static void check_for_memory(pg_data_t *pgdat, int nid)
8041 : {
8042 : enum zone_type zone_type;
8043 :
8044 0 : for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
8045 1 : struct zone *zone = &pgdat->node_zones[zone_type];
8046 1 : if (populated_zone(zone)) {
8047 : if (IS_ENABLED(CONFIG_HIGHMEM))
8048 : node_set_state(nid, N_HIGH_MEMORY);
8049 : if (zone_type <= ZONE_NORMAL)
8050 : node_set_state(nid, N_NORMAL_MEMORY);
8051 : break;
8052 : }
8053 : }
8054 : }
8055 :
8056 : /*
8057 : * Some architectures, e.g. ARC, may have ZONE_HIGHMEM below ZONE_NORMAL. For
8058 : * such cases we allow max_zone_pfn to be sorted in descending order
8059 : */
8060 1 : bool __weak arch_has_descending_max_zone_pfns(void)
8061 : {
8062 1 : return false;
8063 : }
8064 :
8065 : /**
8066 : * free_area_init - Initialise all pg_data_t and zone data
8067 : * @max_zone_pfn: an array of max PFNs for each zone
8068 : *
8069 : * This will call free_area_init_node() for each active node in the system.
8070 : * Using the page ranges provided by memblock_set_node(), the size of each
8071 : * zone in each node and its holes is calculated. If the maximum PFNs
8072 : * of two adjacent zones match, the zone is assumed to be empty.
8073 : * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
8074 : * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
8075 : * starts where the previous one ended. For example, ZONE_DMA32 starts
8076 : * at arch_max_dma_pfn.
8077 : */
8078 1 : void __init free_area_init(unsigned long *max_zone_pfn)
8079 : {
8080 : unsigned long start_pfn, end_pfn;
8081 : int i, nid, zone;
8082 : bool descending;
8083 :
8084 : /* Record where the zone boundaries are */
8085 1 : memset(arch_zone_lowest_possible_pfn, 0,
8086 : sizeof(arch_zone_lowest_possible_pfn));
8087 1 : memset(arch_zone_highest_possible_pfn, 0,
8088 : sizeof(arch_zone_highest_possible_pfn));
8089 :
8090 1 : start_pfn = find_min_pfn_with_active_regions();
8091 1 : descending = arch_has_descending_max_zone_pfns();
8092 :
8093 3 : for (i = 0; i < MAX_NR_ZONES; i++) {
8094 2 : if (descending)
8095 0 : zone = MAX_NR_ZONES - i - 1;
8096 : else
8097 : zone = i;
8098 :
8099 2 : if (zone == ZONE_MOVABLE)
8100 1 : continue;
8101 :
8102 1 : end_pfn = max(max_zone_pfn[zone], start_pfn);
8103 1 : arch_zone_lowest_possible_pfn[zone] = start_pfn;
8104 1 : arch_zone_highest_possible_pfn[zone] = end_pfn;
8105 :
8106 1 : start_pfn = end_pfn;
8107 : }
8108 :
8109 : /* Find the PFNs that ZONE_MOVABLE begins at in each node */
8110 1 : memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
8111 1 : find_zone_movable_pfns_for_nodes();
8112 :
8113 : /* Print out the zone ranges */
8114 1 : pr_info("Zone ranges:\n");
8115 3 : for (i = 0; i < MAX_NR_ZONES; i++) {
8116 2 : if (i == ZONE_MOVABLE)
8117 1 : continue;
8118 1 : pr_info(" %-8s ", zone_names[i]);
8119 2 : if (arch_zone_lowest_possible_pfn[i] ==
8120 1 : arch_zone_highest_possible_pfn[i])
8121 0 : pr_cont("empty\n");
8122 : else
8123 1 : pr_cont("[mem %#018Lx-%#018Lx]\n",
8124 : (u64)arch_zone_lowest_possible_pfn[i]
8125 : << PAGE_SHIFT,
8126 : ((u64)arch_zone_highest_possible_pfn[i]
8127 : << PAGE_SHIFT) - 1);
8128 : }
8129 :
8130 : /* Print out the PFNs ZONE_MOVABLE begins at in each node */
8131 1 : pr_info("Movable zone start for each node\n");
8132 2 : for (i = 0; i < MAX_NUMNODES; i++) {
8133 1 : if (zone_movable_pfn[i])
8134 0 : pr_info(" Node %d: %#018Lx\n", i,
8135 : (u64)zone_movable_pfn[i] << PAGE_SHIFT);
8136 : }
8137 :
8138 : /*
8139 : * Print out the early node map, and initialize the
8140 : * subsection-map relative to active online memory ranges to
8141 : * enable future "sub-section" extensions of the memory map.
8142 : */
8143 1 : pr_info("Early memory node ranges\n");
8144 2 : for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
8145 1 : pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
8146 : (u64)start_pfn << PAGE_SHIFT,
8147 : ((u64)end_pfn << PAGE_SHIFT) - 1);
8148 : subsection_map_init(start_pfn, end_pfn - start_pfn);
8149 : }
8150 :
8151 : /* Initialise every node */
8152 1 : mminit_verify_pageflags_layout();
8153 : setup_nr_node_ids();
8154 2 : for_each_node(nid) {
8155 : pg_data_t *pgdat;
8156 :
8157 1 : if (!node_online(nid)) {
8158 : pr_info("Initializing node %d as memoryless\n", nid);
8159 :
8160 : /* Allocator not initialized yet */
8161 : pgdat = arch_alloc_nodedata(nid);
8162 : if (!pgdat) {
8163 : pr_err("Cannot allocate %zuB for node %d.\n",
8164 : sizeof(*pgdat), nid);
8165 : continue;
8166 : }
8167 : arch_refresh_nodedata(nid, pgdat);
8168 : free_area_init_memoryless_node(nid);
8169 :
8170 : /*
8171 : * We do not want to confuse userspace with sysfs
8172 : * files/directories for a node without any memory
8173 : * attached to it, so this node is not marked as
8174 : * N_MEMORY and not marked online so that no sysfs
8175 : * hierarchy will be created via register_one_node for
8176 : * it. The pgdat will get fully initialized by
8177 : * hotadd_init_pgdat() when memory is hotplugged into
8178 : * this node.
8179 : */
8180 : continue;
8181 : }
8182 :
8183 1 : pgdat = NODE_DATA(nid);
8184 1 : free_area_init_node(nid);
8185 :
8186 : /* Any memory on that node */
8187 : if (pgdat->node_present_pages)
8188 : node_set_state(nid, N_MEMORY);
8189 2 : check_for_memory(pgdat, nid);
8190 : }
8191 :
8192 1 : memmap_init();
8193 1 : }
8194 :
8195 0 : static int __init cmdline_parse_core(char *p, unsigned long *core,
8196 : unsigned long *percent)
8197 : {
8198 : unsigned long long coremem;
8199 : char *endptr;
8200 :
8201 0 : if (!p)
8202 : return -EINVAL;
8203 :
8204 : /* Value may be a percentage of total memory, otherwise bytes */
8205 0 : coremem = simple_strtoull(p, &endptr, 0);
8206 0 : if (*endptr == '%') {
8207 : /* Paranoid check for percent values greater than 100 */
8208 0 : WARN_ON(coremem > 100);
8209 :
8210 0 : *percent = coremem;
8211 : } else {
8212 0 : coremem = memparse(p, &p);
8213 : /* Paranoid check that UL is enough for the coremem value */
8214 0 : WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
8215 :
8216 0 : *core = coremem >> PAGE_SHIFT;
8217 0 : *percent = 0UL;
8218 : }
8219 : return 0;
8220 : }
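 : /*
 :  * For example, booting with "kernelcore=512M" requests 512MiB of
 :  * non-movable memory via the byte path above, while "kernelcore=50%"
 :  * requests half of the total pages via the percentage path.
 :  */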
8221 :
8222 : /*
8223 : * kernelcore=size sets the amount of memory for use by allocations that
8224 : * cannot be reclaimed or migrated.
8225 : */
8226 0 : static int __init cmdline_parse_kernelcore(char *p)
8227 : {
8228 : /* parse kernelcore=mirror */
8229 0 : if (parse_option_str(p, "mirror")) {
8230 0 : mirrored_kernelcore = true;
8231 0 : return 0;
8232 : }
8233 :
8234 0 : return cmdline_parse_core(p, &required_kernelcore,
8235 : &required_kernelcore_percent);
8236 : }
8237 :
8238 : /*
8239 : * movablecore=size sets the amount of memory for use by allocations that
8240 : * can be reclaimed or migrated.
8241 : */
8242 0 : static int __init cmdline_parse_movablecore(char *p)
8243 : {
8244 0 : return cmdline_parse_core(p, &required_movablecore,
8245 : &required_movablecore_percent);
8246 : }
8247 :
8248 : early_param("kernelcore", cmdline_parse_kernelcore);
8249 : early_param("movablecore", cmdline_parse_movablecore);
8250 :
8251 0 : void adjust_managed_page_count(struct page *page, long count)
8252 : {
8253 0 : atomic_long_add(count, &page_zone(page)->managed_pages);
8254 0 : totalram_pages_add(count);
8255 : #ifdef CONFIG_HIGHMEM
8256 : if (PageHighMem(page))
8257 : totalhigh_pages_add(count);
8258 : #endif
8259 0 : }
8260 : EXPORT_SYMBOL(adjust_managed_page_count);
8261 :
8262 0 : unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
8263 : {
8264 : void *pos;
8265 0 : unsigned long pages = 0;
8266 :
8267 0 : start = (void *)PAGE_ALIGN((unsigned long)start);
8268 0 : end = (void *)((unsigned long)end & PAGE_MASK);
8269 0 : for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
8270 0 : struct page *page = virt_to_page(pos);
8271 : void *direct_map_addr;
8272 :
8273 : /*
8274 : * 'direct_map_addr' might be different from 'pos'
8275 : * because some architectures' virt_to_page()
8276 : * work with aliases. Getting the direct map
8277 : * address ensures that we get a _writeable_
8278 : * alias for the memset().
8279 : */
8280 0 : direct_map_addr = page_address(page);
8281 : /*
8282 : * Perform a kasan-unchecked memset() since this memory
8283 : * has not been initialized.
8284 : */
8285 0 : direct_map_addr = kasan_reset_tag(direct_map_addr);
8286 0 : if ((unsigned int)poison <= 0xFF)
8287 0 : memset(direct_map_addr, poison, PAGE_SIZE);
8288 :
8289 0 : free_reserved_page(page);
8290 : }
8291 :
8292 0 : if (pages && s)
8293 0 : pr_info("Freeing %s memory: %ldK\n", s, K(pages));
8294 :
8295 0 : return pages;
8296 : }
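/*
 * Usage sketch (not part of this file; names recalled from memory): a typical
 * caller frees a reserved region and poisons it in one go, roughly the way
 * free_initmem_default() in include/linux/mm.h releases the init sections:
 *
 *   free_reserved_area(&__init_begin, &__init_end,
 *                      POISON_FREE_INITMEM, "unused kernel");
 *
 * Passing a poison value outside 0..0xFF (e.g. -1) skips the memset() above,
 * so a caller that only wants the pages returned to the buddy allocator can
 * pass -1 as the poison argument.
 */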
8297 :
8298 1 : void __init mem_init_print_info(void)
8299 : {
8300 : unsigned long physpages, codesize, datasize, rosize, bss_size;
8301 : unsigned long init_code_size, init_data_size;
8302 :
8303 1 : physpages = get_num_physpages();
8304 1 : codesize = _etext - _stext;
8305 1 : datasize = _edata - _sdata;
8306 1 : rosize = __end_rodata - __start_rodata;
8307 1 : bss_size = __bss_stop - __bss_start;
8308 1 : init_data_size = __init_end - __init_begin;
8309 1 : init_code_size = _einittext - _sinittext;
8310 :
8311 : /*
8312 : * Detect special cases and adjust section sizes accordingly:
8313 : * 1) .init.* may be embedded into .data sections
8314 : * 2) .init.text.* may be out of [__init_begin, __init_end],
8315 : * please refer to arch/tile/kernel/vmlinux.lds.S.
8316 : * 3) .rodata.* may be embedded into .text or .data sections.
8317 : */
8318 : #define adj_init_size(start, end, size, pos, adj) \
8319 : do { \
8320 : if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \
8321 : size -= adj; \
8322 : } while (0)
8323 :
8324 1 : adj_init_size(__init_begin, __init_end, init_data_size,
8325 : _sinittext, init_code_size);
8326 1 : adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
8327 1 : adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
8328 1 : adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
8329 1 : adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
8330 :
8331 : #undef adj_init_size
8332 :
8333 3 : pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
8334 : #ifdef CONFIG_HIGHMEM
8335 : ", %luK highmem"
8336 : #endif
8337 : ")\n",
8338 : K(nr_free_pages()), K(physpages),
8339 : codesize >> 10, datasize >> 10, rosize >> 10,
8340 : (init_data_size + init_code_size) >> 10, bss_size >> 10,
8341 : K(physpages - totalram_pages() - totalcma_pages),
8342 : K(totalcma_pages)
8343 : #ifdef CONFIG_HIGHMEM
8344 : , K(totalhigh_pages())
8345 : #endif
8346 : );
8347 1 : }
8348 :
8349 : /**
8350 : * set_dma_reserve - set the specified number of pages reserved in the first zone
8351 : * @new_dma_reserve: The number of pages to mark reserved
8352 : *
8353 : * The per-cpu batchsize and zone watermarks are determined by managed_pages.
8354 : * In the DMA zone, a significant percentage may be consumed by kernel image
8355 : * and other unfreeable allocations which can skew the watermarks badly. This
8356 : * function may optionally be used to account for unfreeable pages in the
8357 : * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
8358 : * smaller per-cpu batchsize.
8359 : */
8360 0 : void __init set_dma_reserve(unsigned long new_dma_reserve)
8361 : {
8362 0 : dma_reserve = new_dma_reserve;
8363 0 : }
8364 :
8365 0 : static int page_alloc_cpu_dead(unsigned int cpu)
8366 : {
8367 : struct zone *zone;
8368 :
8369 0 : lru_add_drain_cpu(cpu);
8370 0 : mlock_page_drain_remote(cpu);
8371 0 : drain_pages(cpu);
8372 :
8373 : /*
8374 : * Spill the event counters of the dead processor
8375 : * into the current processor's event counters.
8376 : * This artificially elevates the count of the current
8377 : * processor.
8378 : */
8379 0 : vm_events_fold_cpu(cpu);
8380 :
8381 : /*
8382 : * Zero the differential counters of the dead processor
8383 : * so that the vm statistics are consistent.
8384 : *
8385 : * This is only okay since the processor is dead and cannot
8386 : * race with what we are doing.
8387 : */
8388 0 : cpu_vm_stats_fold(cpu);
8389 :
8390 0 : for_each_populated_zone(zone)
8391 0 : zone_pcp_update(zone, 0);
8392 :
8393 0 : return 0;
8394 : }
8395 :
8396 0 : static int page_alloc_cpu_online(unsigned int cpu)
8397 : {
8398 : struct zone *zone;
8399 :
8400 0 : for_each_populated_zone(zone)
8401 0 : zone_pcp_update(zone, 1);
8402 0 : return 0;
8403 : }
8404 :
8405 : #ifdef CONFIG_NUMA
8406 : int hashdist = HASHDIST_DEFAULT;
8407 :
8408 : static int __init set_hashdist(char *str)
8409 : {
8410 : if (!str)
8411 : return 0;
8412 : hashdist = simple_strtoul(str, &str, 0);
8413 : return 1;
8414 : }
8415 : __setup("hashdist=", set_hashdist);
8416 : #endif
8417 :
8418 1 : void __init page_alloc_init(void)
8419 : {
8420 : int ret;
8421 :
8422 : #ifdef CONFIG_NUMA
8423 : if (num_node_state(N_MEMORY) == 1)
8424 : hashdist = 0;
8425 : #endif
8426 :
8427 1 : ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC,
8428 : "mm/page_alloc:pcp",
8429 : page_alloc_cpu_online,
8430 : page_alloc_cpu_dead);
8431 1 : WARN_ON(ret < 0);
8432 1 : }
8433 :
8434 : /*
8435 : * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
8436 : * or min_free_kbytes changes.
8437 : */
8438 2 : static void calculate_totalreserve_pages(void)
8439 : {
8440 : struct pglist_data *pgdat;
8441 2 : unsigned long reserve_pages = 0;
8442 : enum zone_type i, j;
8443 :
8444 4 : for_each_online_pgdat(pgdat) {
8445 :
8446 2 : pgdat->totalreserve_pages = 0;
8447 :
8448 6 : for (i = 0; i < MAX_NR_ZONES; i++) {
8449 4 : struct zone *zone = pgdat->node_zones + i;
8450 4 : long max = 0;
8451 4 : unsigned long managed_pages = zone_managed_pages(zone);
8452 :
8453 : /* Find valid and maximum lowmem_reserve in the zone */
8454 10 : for (j = i; j < MAX_NR_ZONES; j++) {
8455 6 : if (zone->lowmem_reserve[j] > max)
8456 0 : max = zone->lowmem_reserve[j];
8457 : }
8458 :
8459 : /* we treat the high watermark as reserved pages. */
8460 4 : max += high_wmark_pages(zone);
8461 :
8462 4 : if (max > managed_pages)
8463 0 : max = managed_pages;
8464 :
8465 4 : pgdat->totalreserve_pages += max;
8466 :
8467 4 : reserve_pages += max;
8468 : }
8469 : }
8470 2 : totalreserve_pages = reserve_pages;
8471 2 : }
8472 :
8473 : /*
8474 : * setup_per_zone_lowmem_reserve - called whenever
8475 : * sysctl_lowmem_reserve_ratio changes. Ensures that each zone
8476 : * has a correct lowmem reserve value, so an adequate number of
8477 : * pages are left in the zone after a successful __alloc_pages().
8478 : */
8479 1 : static void setup_per_zone_lowmem_reserve(void)
8480 : {
8481 : struct pglist_data *pgdat;
8482 : enum zone_type i, j;
8483 :
8484 2 : for_each_online_pgdat(pgdat) {
8485 2 : for (i = 0; i < MAX_NR_ZONES - 1; i++) {
8486 1 : struct zone *zone = &pgdat->node_zones[i];
8487 1 : int ratio = sysctl_lowmem_reserve_ratio[i];
8488 2 : bool clear = !ratio || !zone_managed_pages(zone);
8489 1 : unsigned long managed_pages = 0;
8490 :
8491 2 : for (j = i + 1; j < MAX_NR_ZONES; j++) {
8492 1 : struct zone *upper_zone = &pgdat->node_zones[j];
8493 :
8494 1 : managed_pages += zone_managed_pages(upper_zone);
8495 :
8496 1 : if (clear)
8497 0 : zone->lowmem_reserve[j] = 0;
8498 : else
8499 1 : zone->lowmem_reserve[j] = managed_pages / ratio;
8500 : }
8501 : }
8502 : }
8503 :
8504 : /* update totalreserve_pages */
8505 1 : calculate_totalreserve_pages();
8506 1 : }
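/*
 * Worked example (assumed numbers): with the common x86-64 default ratios of
 * roughly { DMA: 256, DMA32: 256, Normal: 32 }, a ZONE_DMA32 zone sitting
 * below a ZONE_NORMAL zone of 3,840,000 managed pages ends up with
 *
 *   lowmem_reserve[ZONE_NORMAL] = 3840000 / 256 = 15000 pages (~59 MiB),
 *
 * i.e. an allocation that could have used ZONE_NORMAL may only fall back to
 * ZONE_DMA32 while ZONE_DMA32 still has that many pages free beyond its
 * watermark. Because managed_pages accumulates across the inner loop above,
 * the reserve against a higher zone always covers every zone between it and
 * the current one.
 */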
8507 :
8508 1 : static void __setup_per_zone_wmarks(void)
8509 : {
8510 1 : unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
8511 1 : unsigned long lowmem_pages = 0;
8512 : struct zone *zone;
8513 : unsigned long flags;
8514 :
8515 : /* Calculate total number of !ZONE_HIGHMEM pages */
8516 3 : for_each_zone(zone) {
8517 2 : if (!is_highmem(zone))
8518 2 : lowmem_pages += zone_managed_pages(zone);
8519 : }
8520 :
8521 3 : for_each_zone(zone) {
8522 : u64 tmp;
8523 :
8524 2 : spin_lock_irqsave(&zone->lock, flags);
8525 2 : tmp = (u64)pages_min * zone_managed_pages(zone);
8526 2 : do_div(tmp, lowmem_pages);
8527 2 : if (is_highmem(zone)) {
8528 : /*
8529 : * __GFP_HIGH and PF_MEMALLOC allocations usually don't
8530 : * need highmem pages, so cap pages_min to a small
8531 : * value here.
8532 : *
8533 : * The (WMARK_HIGH - WMARK_LOW) and (WMARK_LOW - WMARK_MIN)
8534 : * deltas control async page reclaim, and so should
8535 : * not be capped for highmem.
8536 : */
8537 : unsigned long min_pages;
8538 :
8539 : min_pages = zone_managed_pages(zone) / 1024;
8540 : min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
8541 : zone->_watermark[WMARK_MIN] = min_pages;
8542 : } else {
8543 : /*
8544 : * If it's a lowmem zone, reserve a number of pages
8545 : * proportionate to the zone's size.
8546 : */
8547 2 : zone->_watermark[WMARK_MIN] = tmp;
8548 : }
8549 :
8550 : /*
8551 : * Set the kswapd watermarks distance according to the
8552 : * scale factor in proportion to available memory, but
8553 : * ensure a minimum size on small systems.
8554 : */
8555 6 : tmp = max_t(u64, tmp >> 2,
8556 : mult_frac(zone_managed_pages(zone),
8557 : watermark_scale_factor, 10000));
8558 :
8559 2 : zone->watermark_boost = 0;
8560 2 : zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
8561 2 : zone->_watermark[WMARK_HIGH] = low_wmark_pages(zone) + tmp;
8562 2 : zone->_watermark[WMARK_PROMO] = high_wmark_pages(zone) + tmp;
8563 :
8564 4 : spin_unlock_irqrestore(&zone->lock, flags);
8565 : }
8566 :
8567 : /* update totalreserve_pages */
8568 1 : calculate_totalreserve_pages();
8569 1 : }
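/*
 * Worked example (assumed numbers, 4 KiB pages): with min_free_kbytes = 4096,
 * pages_min = 4096 >> (PAGE_SHIFT - 10) = 1024 pages. A single lowmem zone
 * holding all 2,000,000 managed pages then gets
 *
 *   WMARK_MIN   = 1024
 *   tmp         = max(1024 >> 2, 2000000 * watermark_scale_factor / 10000)
 *               = max(256, 2000) = 2000     (with the default factor of 10)
 *   WMARK_LOW   = 1024 + 2000 = 3024
 *   WMARK_HIGH  = 3024 + 2000 = 5024
 *   WMARK_PROMO = 5024 + 2000 = 7024
 *
 * so kswapd is woken below roughly 11.8 MiB free and sleeps again above
 * roughly 19.6 MiB free in that zone.
 */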
8570 :
8571 : /**
8572 : * setup_per_zone_wmarks - called when min_free_kbytes changes
8573 : * or when memory is hot-{added|removed}
8574 : *
8575 : * Ensures that the watermark[min,low,high] values for each zone are set
8576 : * correctly with respect to min_free_kbytes.
8577 : */
8578 1 : void setup_per_zone_wmarks(void)
8579 : {
8580 : struct zone *zone;
8581 : static DEFINE_SPINLOCK(lock);
8582 :
8583 1 : spin_lock(&lock);
8584 1 : __setup_per_zone_wmarks();
8585 1 : spin_unlock(&lock);
8586 :
8587 : /*
8588 : * The watermark sizes have changed, so update the pcpu batch
8589 : * and high limits, or the limits may be inappropriate.
8590 : */
8591 3 : for_each_zone(zone)
8592 2 : zone_pcp_update(zone, 0);
8593 1 : }
8594 :
8595 : /*
8596 : * Initialise min_free_kbytes.
8597 : *
8598 : * For small machines we want it small (128k min). For large machines
8599 : * we want it large (256MB max). But it is not linear, because network
8600 : * bandwidth does not increase linearly with machine size. We use
8601 : *
8602 : * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
8603 : * min_free_kbytes = sqrt(lowmem_kbytes * 16)
8604 : *
8605 : * which yields
8606 : *
8607 : * 16MB: 512k
8608 : * 32MB: 724k
8609 : * 64MB: 1024k
8610 : * 128MB: 1448k
8611 : * 256MB: 2048k
8612 : * 512MB: 2896k
8613 : * 1024MB: 4096k
8614 : * 2048MB: 5792k
8615 : * 4096MB: 8192k
8616 : * 8192MB: 11584k
8617 : * 16384MB: 16384k
8618 : */
8619 1 : void calculate_min_free_kbytes(void)
8620 : {
8621 : unsigned long lowmem_kbytes;
8622 : int new_min_free_kbytes;
8623 :
8624 1 : lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
8625 1 : new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
8626 :
8627 1 : if (new_min_free_kbytes > user_min_free_kbytes)
8628 1 : min_free_kbytes = clamp(new_min_free_kbytes, 128, 262144);
8629 : else
8630 0 : pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
8631 : new_min_free_kbytes, user_min_free_kbytes);
8632 :
8633 1 : }
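/*
 * Worked example (assumed numbers): on a machine with 8 GiB of lowmem and
 * 4 KiB pages, nr_free_buffer_pages() is roughly 2,097,152, so
 * lowmem_kbytes is about 8,388,608 and
 *
 *   min_free_kbytes = int_sqrt(8388608 * 16) ~= 11585
 *
 * which is then clamped to [128, 262144] and matches the 8192MB row of the
 * table above (the table rounds slightly).
 */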
8634 :
8635 1 : int __meminit init_per_zone_wmark_min(void)
8636 : {
8637 1 : calculate_min_free_kbytes();
8638 1 : setup_per_zone_wmarks();
8639 : refresh_zone_stat_thresholds();
8640 1 : setup_per_zone_lowmem_reserve();
8641 :
8642 : #ifdef CONFIG_NUMA
8643 : setup_min_unmapped_ratio();
8644 : setup_min_slab_ratio();
8645 : #endif
8646 :
8647 : khugepaged_min_free_kbytes_update();
8648 :
8649 1 : return 0;
8650 : }
8651 : postcore_initcall(init_per_zone_wmark_min)
8652 :
8653 : /*
8654 : * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec_minmax()
8655 : * so that we can update the per-zone watermarks whenever min_free_kbytes
8656 : * changes.
8657 : */
8658 0 : int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
8659 : void *buffer, size_t *length, loff_t *ppos)
8660 : {
8661 : int rc;
8662 :
8663 0 : rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
8664 0 : if (rc)
8665 : return rc;
8666 :
8667 0 : if (write) {
8668 0 : user_min_free_kbytes = min_free_kbytes;
8669 0 : setup_per_zone_wmarks();
8670 : }
8671 : return 0;
8672 : }
8673 :
8674 0 : int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
8675 : void *buffer, size_t *length, loff_t *ppos)
8676 : {
8677 : int rc;
8678 :
8679 0 : rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
8680 0 : if (rc)
8681 : return rc;
8682 :
8683 0 : if (write)
8684 0 : setup_per_zone_wmarks();
8685 :
8686 : return 0;
8687 : }
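/*
 * Tuning note (sketch): watermark_scale_factor is expressed in units of
 * 1/10000 of zone memory, so the default of 10 keeps roughly 0.1% of each
 * zone between consecutive watermarks. Writing 100 to
 * /proc/sys/vm/watermark_scale_factor widens that gap to about 1%, waking
 * kswapd earlier and letting it run longer before allocations stall.
 */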
8688 :
8689 : #ifdef CONFIG_NUMA
8690 : static void setup_min_unmapped_ratio(void)
8691 : {
8692 : pg_data_t *pgdat;
8693 : struct zone *zone;
8694 :
8695 : for_each_online_pgdat(pgdat)
8696 : pgdat->min_unmapped_pages = 0;
8697 :
8698 : for_each_zone(zone)
8699 : zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) *
8700 : sysctl_min_unmapped_ratio) / 100;
8701 : }
8702 :
8703 :
8704 : int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
8705 : void *buffer, size_t *length, loff_t *ppos)
8706 : {
8707 : int rc;
8708 :
8709 : rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
8710 : if (rc)
8711 : return rc;
8712 :
8713 : setup_min_unmapped_ratio();
8714 :
8715 : return 0;
8716 : }
8717 :
8718 : static void setup_min_slab_ratio(void)
8719 : {
8720 : pg_data_t *pgdat;
8721 : struct zone *zone;
8722 :
8723 : for_each_online_pgdat(pgdat)
8724 : pgdat->min_slab_pages = 0;
8725 :
8726 : for_each_zone(zone)
8727 : zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) *
8728 : sysctl_min_slab_ratio) / 100;
8729 : }
8730 :
8731 : int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
8732 : void *buffer, size_t *length, loff_t *ppos)
8733 : {
8734 : int rc;
8735 :
8736 : rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
8737 : if (rc)
8738 : return rc;
8739 :
8740 : setup_min_slab_ratio();
8741 :
8742 : return 0;
8743 : }
8744 : #endif
8745 :
8746 : /*
8747 : * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
8748 : * proc_dointvec_minmax() so that we can call setup_per_zone_lowmem_reserve()
8749 : * whenever sysctl_lowmem_reserve_ratio changes.
8750 : *
8751 : * The reserve ratio obviously has absolutely no relation with the
8752 : * minimum watermarks. The lowmem reserve ratio is only meaningful
8753 : * in relation to the boot-time zone sizes.
8754 : */
8755 0 : int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
8756 : void *buffer, size_t *length, loff_t *ppos)
8757 : {
8758 : int i;
8759 :
8760 0 : proc_dointvec_minmax(table, write, buffer, length, ppos);
8761 :
8762 0 : for (i = 0; i < MAX_NR_ZONES; i++) {
8763 0 : if (sysctl_lowmem_reserve_ratio[i] < 1)
8764 0 : sysctl_lowmem_reserve_ratio[i] = 0;
8765 : }
8766 :
8767 0 : setup_per_zone_lowmem_reserve();
8768 0 : return 0;
8769 : }
8770 :
8771 : /*
8772 : * percpu_pagelist_high_fraction - changes the pcp->high for each zone on each
8773 : * cpu. It is the fraction of total pages in each zone that a hot per cpu
8774 : * pagelist can hold before it gets flushed back to the buddy allocator.
8775 : */
8776 0 : int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table,
8777 : int write, void *buffer, size_t *length, loff_t *ppos)
8778 : {
8779 : struct zone *zone;
8780 : int old_percpu_pagelist_high_fraction;
8781 : int ret;
8782 :
8783 0 : mutex_lock(&pcp_batch_high_lock);
8784 0 : old_percpu_pagelist_high_fraction = percpu_pagelist_high_fraction;
8785 :
8786 0 : ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
8787 0 : if (!write || ret < 0)
8788 : goto out;
8789 :
8790 : /* Sanity checking to avoid pcp imbalance */
8791 0 : if (percpu_pagelist_high_fraction &&
8792 : percpu_pagelist_high_fraction < MIN_PERCPU_PAGELIST_HIGH_FRACTION) {
8793 0 : percpu_pagelist_high_fraction = old_percpu_pagelist_high_fraction;
8794 0 : ret = -EINVAL;
8795 0 : goto out;
8796 : }
8797 :
8798 : /* No change? */
8799 0 : if (percpu_pagelist_high_fraction == old_percpu_pagelist_high_fraction)
8800 : goto out;
8801 :
8802 0 : for_each_populated_zone(zone)
8803 0 : zone_set_pageset_high_and_batch(zone, 0);
8804 : out:
8805 0 : mutex_unlock(&pcp_batch_high_lock);
8806 0 : return ret;
8807 : }
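/*
 * Tuning note (sketch): writing e.g. 8 to
 * /proc/sys/vm/percpu_pagelist_high_fraction caps each zone's pcp lists at
 * roughly 1/8 of the zone's pages, split across the local CPUs. Non-zero
 * values below MIN_PERCPU_PAGELIST_HIGH_FRACTION are rejected above to avoid
 * pathologically large per-cpu lists, and writing 0 restores the default
 * heuristic sizing.
 */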
8808 :
8809 : #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
8810 : /*
8811 : * Returns the number of pages that the arch has reserved but
8812 : * that are not known to alloc_large_system_hash().
8813 : */
8814 : static unsigned long __init arch_reserved_kernel_pages(void)
8815 : {
8816 : return 0;
8817 : }
8818 : #endif
8819 :
8820 : /*
8821 : * The adaptive scale is meant to reduce the size of hash tables on large-memory
8822 : * machines. As the memory size increases, the scale is also increased, but at a
8823 : * slower pace. Starting from ADAPT_SCALE_BASE (64G), every time memory
8824 : * quadruples the scale is increased by one, which means the size of the hash
8825 : * table only doubles, instead of quadrupling as well.
8826 : * Because 32-bit systems cannot have large physical memory, where this scaling
8827 : * makes sense, it is disabled on such platforms.
8828 : */
8829 : #if __BITS_PER_LONG > 32
8830 : #define ADAPT_SCALE_BASE (64ul << 30)
8831 : #define ADAPT_SCALE_SHIFT 2
8832 : #define ADAPT_SCALE_NPAGES (ADAPT_SCALE_BASE >> PAGE_SHIFT)
8833 : #endif
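/*
 * Worked example (assumed sizes): with 64 GiB of memory the adaptation loop
 * below never runs and the caller's scale is used as-is. With 256 GiB,
 * numentries starts out four times ADAPT_SCALE_NPAGES, the loop runs once and
 * scale grows by one, so the resulting table is only twice (not four times)
 * the 64 GiB size. At 1 TiB the scale grows by two, and so on.
 */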
8834 :
8835 : /*
8836 : * allocate a large system hash table from bootmem
8837 : * - it is assumed that the hash table must contain an exact power-of-2
8838 : * quantity of entries
8839 : * - limit is the number of hash buckets, not the total allocation size
8840 : */
8841 5 : void *__init alloc_large_system_hash(const char *tablename,
8842 : unsigned long bucketsize,
8843 : unsigned long numentries,
8844 : int scale,
8845 : int flags,
8846 : unsigned int *_hash_shift,
8847 : unsigned int *_hash_mask,
8848 : unsigned long low_limit,
8849 : unsigned long high_limit)
8850 : {
8851 5 : unsigned long long max = high_limit;
8852 : unsigned long log2qty, size;
8853 5 : void *table = NULL;
8854 : gfp_t gfp_flags;
8855 : bool virt;
8856 : bool huge;
8857 :
8858 : /* allow the kernel cmdline to have a say */
8859 5 : if (!numentries) {
8860 : /* round applicable memory size up to nearest megabyte */
8861 4 : numentries = nr_kernel_pages;
8862 4 : numentries -= arch_reserved_kernel_pages();
8863 :
8864 : /* It isn't necessary when PAGE_SIZE >= 1MB */
8865 : if (PAGE_SHIFT < 20)
8866 4 : numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
8867 :
8868 : #if __BITS_PER_LONG > 32
8869 4 : if (!high_limit) {
8870 : unsigned long adapt;
8871 :
8872 4 : for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
8873 0 : adapt <<= ADAPT_SCALE_SHIFT)
8874 0 : scale++;
8875 : }
8876 : #endif
8877 :
8878 : /* limit to 1 bucket per 2^scale bytes of low memory */
8879 4 : if (scale > PAGE_SHIFT)
8880 4 : numentries >>= (scale - PAGE_SHIFT);
8881 : else
8882 0 : numentries <<= (PAGE_SHIFT - scale);
8883 :
8884 : /* Make sure we've got at least a 0-order allocation. */
8885 4 : if (unlikely(flags & HASH_SMALL)) {
8886 : /* Makes no sense without HASH_EARLY */
8887 0 : WARN_ON(!(flags & HASH_EARLY));
8888 0 : if (!(numentries >> *_hash_shift)) {
8889 0 : numentries = 1UL << *_hash_shift;
8890 0 : BUG_ON(!numentries);
8891 : }
8892 4 : } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
8893 0 : numentries = PAGE_SIZE / bucketsize;
8894 : }
8895 10 : numentries = roundup_pow_of_two(numentries);
8896 :
8897 : /* limit allocation size to 1/16 total memory by default */
8898 5 : if (max == 0) {
8899 4 : max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
8900 4 : do_div(max, bucketsize);
8901 : }
8902 5 : max = min(max, 0x80000000ULL);
8903 :
8904 5 : if (numentries < low_limit)
8905 0 : numentries = low_limit;
8906 5 : if (numentries > max)
8907 0 : numentries = max;
8908 :
8909 10 : log2qty = ilog2(numentries);
8910 :
8911 5 : gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
8912 : do {
8913 5 : virt = false;
8914 5 : size = bucketsize << log2qty;
8915 5 : if (flags & HASH_EARLY) {
8916 2 : if (flags & HASH_ZERO)
8917 2 : table = memblock_alloc(size, SMP_CACHE_BYTES);
8918 : else
8919 0 : table = memblock_alloc_raw(size,
8920 : SMP_CACHE_BYTES);
8921 3 : } else if (get_order(size) >= MAX_ORDER || hashdist) {
8922 0 : table = vmalloc_huge(size, gfp_flags);
8923 0 : virt = true;
8924 : if (table)
8925 : huge = is_vm_area_hugepages(table);
8926 : } else {
8927 : /*
8928 : * If bucketsize is not a power of two, we may free
8929 : * some pages at the end of the hash table, which
8930 : * alloc_pages_exact() does automatically.
8931 : */
8932 3 : table = alloc_pages_exact(size, gfp_flags);
8933 3 : kmemleak_alloc(table, size, 1, gfp_flags);
8934 : }
8935 5 : } while (!table && size > PAGE_SIZE && --log2qty);
8936 :
8937 5 : if (!table)
8938 0 : panic("Failed to allocate %s hash table\n", tablename);
8939 :
8940 10 : pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
8941 : tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
8942 : virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear");
8943 :
8944 5 : if (_hash_shift)
8945 5 : *_hash_shift = log2qty;
8946 5 : if (_hash_mask)
8947 3 : *_hash_mask = (1 << log2qty) - 1;
8948 :
8949 5 : return table;
8950 : }
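/*
 * Usage sketch (loosely modeled on the inode-cache setup in fs/inode.c; the
 * exact flags and scale used there may differ):
 *
 *   inode_hashtable =
 *           alloc_large_system_hash("Inode-cache",
 *                                   sizeof(struct hlist_head),
 *                                   ihash_entries,     (0 means size from memory)
 *                                   14,                (one bucket per 16 KiB of low memory)
 *                                   HASH_ZERO,
 *                                   &i_hash_shift, &i_hash_mask,
 *                                   0, 0);             (no explicit limits)
 *
 * The returned table holds 2^i_hash_shift buckets, and i_hash_mask can be
 * used to fold a hash value into the table.
 */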
8951 :
8952 : /*
8953 : * This function checks whether the pageblock includes unmovable pages.
8954 : *
8955 : * A PageLRU check without isolation or the lru_lock can race, so a
8956 : * MIGRATE_MOVABLE block might include unmovable pages. Likewise, a
8957 : * __PageMovable check without lock_page may miss some movable non-LRU
8958 : * pages in a race. So this function cannot be expected to be exact.
8959 : *
8960 : * Returns a page without holding a reference. If the caller wants to
8961 : * dereference that page (e.g., dumping), it has to make sure that it
8962 : * cannot get removed (e.g., via memory unplug) concurrently.
8963 : *
8964 : */
8965 0 : struct page *has_unmovable_pages(struct zone *zone, struct page *page,
8966 : int migratetype, int flags)
8967 : {
8968 0 : unsigned long iter = 0;
8969 0 : unsigned long pfn = page_to_pfn(page);
8970 0 : unsigned long offset = pfn % pageblock_nr_pages;
8971 :
8972 : if (is_migrate_cma_page(page)) {
8973 : /*
8974 : * CMA allocations (alloc_contig_range) really need to mark CMA
8975 : * pageblocks as isolated even when they are not in fact movable,
8976 : * so consider them movable here.
8977 : */
8978 : if (is_migrate_cma(migratetype))
8979 : return NULL;
8980 :
8981 : return page;
8982 : }
8983 :
8984 0 : for (; iter < pageblock_nr_pages - offset; iter++) {
8985 0 : page = pfn_to_page(pfn + iter);
8986 :
8987 : /*
8988 : * Both, bootmem allocations and memory holes are marked
8989 : * PG_reserved and are unmovable. We can even have unmovable
8990 : * allocations inside ZONE_MOVABLE, for example when
8991 : * specifying "movablecore".
8992 : */
8993 0 : if (PageReserved(page))
8994 : return page;
8995 :
8996 : /*
8997 : * If the zone is movable and we have ruled out all reserved
8998 : * pages then it should be reasonably safe to assume the rest
8999 : * is movable.
9000 : */
9001 0 : if (zone_idx(zone) == ZONE_MOVABLE)
9002 0 : continue;
9003 :
9004 : /*
9005 : * Hugepages are not in LRU lists, but they're movable.
9006 : * THPs are on the LRU, but need to be counted as #small pages.
9007 : * THPs are on the LRU, but need to be counted as their number of small pages.
9008 : * handle each tail page individually in migration.
9009 : */
9010 0 : if (PageHuge(page) || PageTransCompound(page)) {
9011 : struct page *head = compound_head(page);
9012 : unsigned int skip_pages;
9013 :
9014 : if (PageHuge(page)) {
9015 : if (!hugepage_migration_supported(page_hstate(head)))
9016 : return page;
9017 : } else if (!PageLRU(head) && !__PageMovable(head)) {
9018 : return page;
9019 : }
9020 :
9021 : skip_pages = compound_nr(head) - (page - head);
9022 : iter += skip_pages - 1;
9023 : continue;
9024 : }
9025 :
9026 : /*
9027 : * We can't use page_count without pinning the page
9028 : * because another CPU can free the compound page.
9029 : * This check already skips compound tails of THP
9030 : * because their page->_refcount is zero at all times.
9031 : */
9032 0 : if (!page_ref_count(page)) {
9033 0 : if (PageBuddy(page))
9034 0 : iter += (1 << buddy_order(page)) - 1;
9035 0 : continue;
9036 : }
9037 :
9038 : /*
9039 : * The HWPoisoned page may not be in the buddy system, and
9040 : * page_count() is not 0.
9041 : */
9042 0 : if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
9043 : continue;
9044 :
9045 : /*
9046 : * We treat all PageOffline() pages as movable when offlining
9047 : * to give drivers a chance to decrement their reference count
9048 : * in MEM_GOING_OFFLINE in order to indicate that these pages
9049 : * can be offlined as there are no direct references anymore.
9050 : * For actually unmovable PageOffline() where the driver does
9051 : * not support this, we will fail later when trying to actually
9052 : * move these pages that still have a reference count > 0.
9053 : * (false negatives in this function only)
9054 : */
9055 0 : if ((flags & MEMORY_OFFLINE) && PageOffline(page))
9056 0 : continue;
9057 :
9058 0 : if (__PageMovable(page) || PageLRU(page))
9059 0 : continue;
9060 :
9061 : /*
9062 : * If there are RECLAIMABLE pages, we need to check
9063 : * them. But for now, memory offlining itself doesn't call
9064 : * shrink_node_slabs(), and this still needs to be fixed.
9065 : */
9066 : return page;
9067 : }
9068 : return NULL;
9069 : }
9070 :
9071 : #ifdef CONFIG_CONTIG_ALLOC
9072 : static unsigned long pfn_max_align_down(unsigned long pfn)
9073 : {
9074 : return ALIGN_DOWN(pfn, MAX_ORDER_NR_PAGES);
9075 : }
9076 :
9077 : static unsigned long pfn_max_align_up(unsigned long pfn)
9078 : {
9079 : return ALIGN(pfn, MAX_ORDER_NR_PAGES);
9080 : }
9081 :
9082 : #if defined(CONFIG_DYNAMIC_DEBUG) || \
9083 : (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
9084 : /* Usage: See admin-guide/dynamic-debug-howto.rst */
9085 : static void alloc_contig_dump_pages(struct list_head *page_list)
9086 : {
9087 : DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, "migrate failure");
9088 :
9089 : if (DYNAMIC_DEBUG_BRANCH(descriptor)) {
9090 : struct page *page;
9091 :
9092 : dump_stack();
9093 : list_for_each_entry(page, page_list, lru)
9094 : dump_page(page, "migration failure");
9095 : }
9096 : }
9097 : #else
9098 : static inline void alloc_contig_dump_pages(struct list_head *page_list)
9099 : {
9100 : }
9101 : #endif
9102 :
9103 : /* [start, end) must belong to a single zone. */
9104 : static int __alloc_contig_migrate_range(struct compact_control *cc,
9105 : unsigned long start, unsigned long end)
9106 : {
9107 : /* This function is based on compact_zone() from compaction.c. */
9108 : unsigned int nr_reclaimed;
9109 : unsigned long pfn = start;
9110 : unsigned int tries = 0;
9111 : int ret = 0;
9112 : struct migration_target_control mtc = {
9113 : .nid = zone_to_nid(cc->zone),
9114 : .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
9115 : };
9116 :
9117 : lru_cache_disable();
9118 :
9119 : while (pfn < end || !list_empty(&cc->migratepages)) {
9120 : if (fatal_signal_pending(current)) {
9121 : ret = -EINTR;
9122 : break;
9123 : }
9124 :
9125 : if (list_empty(&cc->migratepages)) {
9126 : cc->nr_migratepages = 0;
9127 : ret = isolate_migratepages_range(cc, pfn, end);
9128 : if (ret && ret != -EAGAIN)
9129 : break;
9130 : pfn = cc->migrate_pfn;
9131 : tries = 0;
9132 : } else if (++tries == 5) {
9133 : ret = -EBUSY;
9134 : break;
9135 : }
9136 :
9137 : nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
9138 : &cc->migratepages);
9139 : cc->nr_migratepages -= nr_reclaimed;
9140 :
9141 : ret = migrate_pages(&cc->migratepages, alloc_migration_target,
9142 : NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL);
9143 :
9144 : /*
9145 : * On -ENOMEM, migrate_pages() bails out right away. It is pointless
9146 : * to retry on this error, so do the same here.
9147 : */
9148 : if (ret == -ENOMEM)
9149 : break;
9150 : }
9151 :
9152 : lru_cache_enable();
9153 : if (ret < 0) {
9154 : if (ret == -EBUSY)
9155 : alloc_contig_dump_pages(&cc->migratepages);
9156 : putback_movable_pages(&cc->migratepages);
9157 : return ret;
9158 : }
9159 : return 0;
9160 : }
9161 :
9162 : /**
9163 : * alloc_contig_range() -- tries to allocate given range of pages
9164 : * @start: start PFN to allocate
9165 : * @end: one-past-the-last PFN to allocate
9166 : * @migratetype: migratetype of the underlying pageblocks (either
9167 : * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
9168 : * in range must have the same migratetype and it must
9169 : * be either of the two.
9170 : * @gfp_mask: GFP mask to use during compaction
9171 : *
9172 : * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
9173 : * aligned. The PFN range must belong to a single zone.
9174 : *
9175 : * The first thing this routine does is attempt to MIGRATE_ISOLATE all
9176 : * pageblocks in the range. Once isolated, the pageblocks should not
9177 : * be modified by others.
9178 : *
9179 : * Return: zero on success or negative error code. On success all
9180 : * pages which PFN is in [start, end) are allocated for the caller and
9181 : * need to be freed with free_contig_range().
9182 : */
9183 : int alloc_contig_range(unsigned long start, unsigned long end,
9184 : unsigned migratetype, gfp_t gfp_mask)
9185 : {
9186 : unsigned long outer_start, outer_end;
9187 : unsigned int order;
9188 : int ret = 0;
9189 :
9190 : struct compact_control cc = {
9191 : .nr_migratepages = 0,
9192 : .order = -1,
9193 : .zone = page_zone(pfn_to_page(start)),
9194 : .mode = MIGRATE_SYNC,
9195 : .ignore_skip_hint = true,
9196 : .no_set_skip_hint = true,
9197 : .gfp_mask = current_gfp_context(gfp_mask),
9198 : .alloc_contig = true,
9199 : };
9200 : INIT_LIST_HEAD(&cc.migratepages);
9201 :
9202 : /*
9203 : * What we do here is mark all pageblocks in the range as
9204 : * MIGRATE_ISOLATE. Because pageblock and max order pages may
9205 : * have different sizes, and due to the way the page allocator
9206 : * works, we align the range to the bigger of the two sizes so
9207 : * that the page allocator won't try to merge buddies from
9208 : * different pageblocks and change MIGRATE_ISOLATE to some
9209 : * other migration type.
9210 : *
9211 : * Once the pageblocks are marked as MIGRATE_ISOLATE, we
9212 : * migrate the pages from the unaligned range (i.e. the pages
9213 : * we are interested in). This will put all the pages in the
9214 : * range back to the page allocator as MIGRATE_ISOLATE.
9215 : *
9216 : * When this is done, we take the pages in the range from the
9217 : * page allocator, removing them from the buddy system. This
9218 : * way the page allocator will never consider using them.
9219 : *
9220 : * This lets us mark the pageblocks back as
9221 : * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
9222 : * aligned range but not in the unaligned, original range are
9223 : * put back to the page allocator so that buddy can use them.
9224 : */
9225 :
9226 : ret = start_isolate_page_range(pfn_max_align_down(start),
9227 : pfn_max_align_up(end), migratetype, 0);
9228 : if (ret)
9229 : return ret;
9230 :
9231 : drain_all_pages(cc.zone);
9232 :
9233 : /*
9234 : * In case of -EBUSY, we'd like to know which page causes problem.
9235 : * So, just fall through. test_pages_isolated() has a tracepoint
9236 : * which will report the busy page.
9237 : *
9238 : * It is possible that busy pages could become available before
9239 : * the call to test_pages_isolated, and the range will actually be
9240 : * allocated. So, if we fall through be sure to clear ret so that
9241 : * -EBUSY is not accidentally used or returned to caller.
9242 : */
9243 : ret = __alloc_contig_migrate_range(&cc, start, end);
9244 : if (ret && ret != -EBUSY)
9245 : goto done;
9246 : ret = 0;
9247 :
9248 : /*
9249 : * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
9250 : * aligned blocks that are marked as MIGRATE_ISOLATE. What's
9251 : * more, all pages in [start, end) are free in page allocator.
9252 : * What we are going to do is to allocate all pages from
9253 : * [start, end) (that is remove them from page allocator).
9254 : *
9255 : * The only problem is that pages at the beginning and at the
9256 : * end of the interesting range may not be aligned with pages that
9257 : * the page allocator holds, i.e. they can be part of higher-order
9258 : * pages. Because of this, we reserve the bigger range and
9259 : * once this is done free the pages we are not interested in.
9260 : *
9261 : * We don't have to hold zone->lock here because the pages are
9262 : * isolated thus they won't get removed from buddy.
9263 : */
9264 :
9265 : order = 0;
9266 : outer_start = start;
9267 : while (!PageBuddy(pfn_to_page(outer_start))) {
9268 : if (++order >= MAX_ORDER) {
9269 : outer_start = start;
9270 : break;
9271 : }
9272 : outer_start &= ~0UL << order;
9273 : }
9274 :
9275 : if (outer_start != start) {
9276 : order = buddy_order(pfn_to_page(outer_start));
9277 :
9278 : /*
9279 : * The outer_start page could be a small-order buddy page that
9280 : * doesn't include the start page. Adjust outer_start in this
9281 : * case so the failed page is reported properly by the
9282 : * tracepoint in test_pages_isolated().
9283 : */
9284 : if (outer_start + (1UL << order) <= start)
9285 : outer_start = start;
9286 : }
9287 :
9288 : /* Make sure the range is really isolated. */
9289 : if (test_pages_isolated(outer_start, end, 0)) {
9290 : ret = -EBUSY;
9291 : goto done;
9292 : }
9293 :
9294 : /* Grab isolated pages from freelists. */
9295 : outer_end = isolate_freepages_range(&cc, outer_start, end);
9296 : if (!outer_end) {
9297 : ret = -EBUSY;
9298 : goto done;
9299 : }
9300 :
9301 : /* Free head and tail (if any) */
9302 : if (start != outer_start)
9303 : free_contig_range(outer_start, start - outer_start);
9304 : if (end != outer_end)
9305 : free_contig_range(end, outer_end - end);
9306 :
9307 : done:
9308 : undo_isolate_page_range(pfn_max_align_down(start),
9309 : pfn_max_align_up(end), migratetype);
9310 : return ret;
9311 : }
9312 : EXPORT_SYMBOL(alloc_contig_range);
9313 :
9314 : static int __alloc_contig_pages(unsigned long start_pfn,
9315 : unsigned long nr_pages, gfp_t gfp_mask)
9316 : {
9317 : unsigned long end_pfn = start_pfn + nr_pages;
9318 :
9319 : return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
9320 : gfp_mask);
9321 : }
9322 :
9323 : static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
9324 : unsigned long nr_pages)
9325 : {
9326 : unsigned long i, end_pfn = start_pfn + nr_pages;
9327 : struct page *page;
9328 :
9329 : for (i = start_pfn; i < end_pfn; i++) {
9330 : page = pfn_to_online_page(i);
9331 : if (!page)
9332 : return false;
9333 :
9334 : if (page_zone(page) != z)
9335 : return false;
9336 :
9337 : if (PageReserved(page))
9338 : return false;
9339 : }
9340 : return true;
9341 : }
9342 :
9343 : static bool zone_spans_last_pfn(const struct zone *zone,
9344 : unsigned long start_pfn, unsigned long nr_pages)
9345 : {
9346 : unsigned long last_pfn = start_pfn + nr_pages - 1;
9347 :
9348 : return zone_spans_pfn(zone, last_pfn);
9349 : }
9350 :
9351 : /**
9352 : * alloc_contig_pages() -- tries to find and allocate contiguous range of pages
9353 : * @nr_pages: Number of contiguous pages to allocate
9354 : * @gfp_mask: GFP mask to limit search and used during compaction
9355 : * @nid: Target node
9356 : * @nodemask: Mask for other possible nodes
9357 : *
9358 : * This routine is a wrapper around alloc_contig_range(). It scans over zones
9359 : * on an applicable zonelist to find a contiguous pfn range which can then be
9360 : * tried for allocation with alloc_contig_range(). This routine is intended
9361 : * for allocation requests which cannot be fulfilled with the buddy allocator.
9362 : *
9363 : * The allocated memory is always aligned to a page boundary. If nr_pages is a
9364 : * power of two, then the allocated range is also guaranteed to be aligned to
9365 : * nr_pages (e.g. a 1GB request would be 1GB-aligned).
9366 : *
9367 : * Allocated pages can be freed with free_contig_range() or by manually calling
9368 : * __free_page() on each allocated page.
9369 : *
9370 : * Return: pointer to contiguous pages on success, or NULL if not successful.
9371 : */
9372 : struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
9373 : int nid, nodemask_t *nodemask)
9374 : {
9375 : unsigned long ret, pfn, flags;
9376 : struct zonelist *zonelist;
9377 : struct zone *zone;
9378 : struct zoneref *z;
9379 :
9380 : zonelist = node_zonelist(nid, gfp_mask);
9381 : for_each_zone_zonelist_nodemask(zone, z, zonelist,
9382 : gfp_zone(gfp_mask), nodemask) {
9383 : spin_lock_irqsave(&zone->lock, flags);
9384 :
9385 : pfn = ALIGN(zone->zone_start_pfn, nr_pages);
9386 : while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
9387 : if (pfn_range_valid_contig(zone, pfn, nr_pages)) {
9388 : /*
9389 : * We release the zone lock here because
9390 : * alloc_contig_range() will also lock the zone
9391 : * at some point. If there's an allocation
9392 : * spinning on this lock, it may win the race
9393 : * and cause alloc_contig_range() to fail...
9394 : */
9395 : spin_unlock_irqrestore(&zone->lock, flags);
9396 : ret = __alloc_contig_pages(pfn, nr_pages,
9397 : gfp_mask);
9398 : if (!ret)
9399 : return pfn_to_page(pfn);
9400 : spin_lock_irqsave(&zone->lock, flags);
9401 : }
9402 : pfn += nr_pages;
9403 : }
9404 : spin_unlock_irqrestore(&zone->lock, flags);
9405 : }
9406 : return NULL;
9407 : }
9408 : #endif /* CONFIG_CONTIG_ALLOC */
9409 :
9410 0 : void free_contig_range(unsigned long pfn, unsigned long nr_pages)
9411 : {
9412 0 : unsigned long count = 0;
9413 :
9414 0 : for (; nr_pages--; pfn++) {
9415 0 : struct page *page = pfn_to_page(pfn);
9416 :
9417 0 : count += page_count(page) != 1;
9418 0 : __free_page(page);
9419 : }
9420 0 : WARN(count != 0, "%lu pages are still in use!\n", count);
9421 0 : }
9422 : EXPORT_SYMBOL(free_contig_range);
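/*
 * Usage sketch (illustrative; the gfp mask and error handling are
 * assumptions): a caller needing a physically contiguous buffer larger than
 * MAX_ORDER pages could, with CONFIG_CONTIG_ALLOC enabled, do
 *
 *   struct page *page;
 *
 *   page = alloc_contig_pages(nr_pages, GFP_KERNEL | __GFP_NOWARN,
 *                             numa_node_id(), NULL);
 *   if (page) {
 *           ... use the pages starting at page_to_pfn(page) ...
 *           free_contig_range(page_to_pfn(page), nr_pages);
 *   }
 */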
9423 :
9424 : /*
9425 : * The zone indicated has a new number of managed_pages; batch sizes and percpu
9426 : * page high values need to be recalculated.
9427 : */
9428 2 : void zone_pcp_update(struct zone *zone, int cpu_online)
9429 : {
9430 2 : mutex_lock(&pcp_batch_high_lock);
9431 2 : zone_set_pageset_high_and_batch(zone, cpu_online);
9432 2 : mutex_unlock(&pcp_batch_high_lock);
9433 2 : }
9434 :
9435 : /*
9436 : * Effectively disable pcplists for the zone by setting the high limit to 0
9437 : * and draining all cpus. A concurrent page freeing on another CPU that's about
9438 : * to put the page on pcplist will either finish before the drain and the page
9439 : * will be drained, or observe the new high limit and skip the pcplist.
9440 : *
9441 : * Must be paired with a call to zone_pcp_enable().
9442 : */
9443 0 : void zone_pcp_disable(struct zone *zone)
9444 : {
9445 0 : mutex_lock(&pcp_batch_high_lock);
9446 0 : __zone_set_pageset_high_and_batch(zone, 0, 1);
9447 0 : __drain_all_pages(zone, true);
9448 0 : }
9449 :
9450 0 : void zone_pcp_enable(struct zone *zone)
9451 : {
9452 0 : __zone_set_pageset_high_and_batch(zone, zone->pageset_high, zone->pageset_batch);
9453 0 : mutex_unlock(&pcp_batch_high_lock);
9454 0 : }
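/*
 * Usage sketch: callers such as memory offlining bracket their work with the
 * pair above, e.g.
 *
 *   zone_pcp_disable(zone);
 *   ... operate on the zone while its pcplists are drained and stay empty ...
 *   zone_pcp_enable(zone);
 *
 * pcp_batch_high_lock is held for the whole window, so concurrent
 * disable/enable pairs serialize against each other.
 */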
9455 :
9456 0 : void zone_pcp_reset(struct zone *zone)
9457 : {
9458 : int cpu;
9459 : struct per_cpu_zonestat *pzstats;
9460 :
9461 0 : if (zone->per_cpu_pageset != &boot_pageset) {
9462 : for_each_online_cpu(cpu) {
9463 : pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
9464 : drain_zonestat(zone, pzstats);
9465 : }
9466 0 : free_percpu(zone->per_cpu_pageset);
9467 0 : free_percpu(zone->per_cpu_zonestats);
9468 0 : zone->per_cpu_pageset = &boot_pageset;
9469 0 : zone->per_cpu_zonestats = &boot_zonestats;
9470 : }
9471 0 : }
9472 :
9473 : #ifdef CONFIG_MEMORY_HOTREMOVE
9474 : /*
9475 : * All pages in the range must be in a single zone; the range must not contain holes,
9476 : * must span full sections, and must be isolated before calling this function.
9477 : */
9478 : void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
9479 : {
9480 : unsigned long pfn = start_pfn;
9481 : struct page *page;
9482 : struct zone *zone;
9483 : unsigned int order;
9484 : unsigned long flags;
9485 :
9486 : offline_mem_sections(pfn, end_pfn);
9487 : zone = page_zone(pfn_to_page(pfn));
9488 : spin_lock_irqsave(&zone->lock, flags);
9489 : while (pfn < end_pfn) {
9490 : page = pfn_to_page(pfn);
9491 : /*
9492 : * The HWPoisoned page may not be in the buddy system, and
9493 : * page_count() is not 0.
9494 : */
9495 : if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
9496 : pfn++;
9497 : continue;
9498 : }
9499 : /*
9500 : * At this point all remaining PageOffline() pages have a
9501 : * reference count of 0 and can simply be skipped.
9502 : */
9503 : if (PageOffline(page)) {
9504 : BUG_ON(page_count(page));
9505 : BUG_ON(PageBuddy(page));
9506 : pfn++;
9507 : continue;
9508 : }
9509 :
9510 : BUG_ON(page_count(page));
9511 : BUG_ON(!PageBuddy(page));
9512 : order = buddy_order(page);
9513 : del_page_from_free_list(page, zone, order);
9514 : pfn += (1 << order);
9515 : }
9516 : spin_unlock_irqrestore(&zone->lock, flags);
9517 : }
9518 : #endif
9519 :
9520 : /*
9521 : * This function returns a stable result only if called under zone lock.
9522 : */
9523 0 : bool is_free_buddy_page(struct page *page)
9524 : {
9525 0 : unsigned long pfn = page_to_pfn(page);
9526 : unsigned int order;
9527 :
9528 0 : for (order = 0; order < MAX_ORDER; order++) {
9529 0 : struct page *page_head = page - (pfn & ((1 << order) - 1));
9530 :
9531 0 : if (PageBuddy(page_head) &&
9532 0 : buddy_order_unsafe(page_head) >= order)
9533 : break;
9534 : }
9535 :
9536 0 : return order < MAX_ORDER;
9537 : }
9538 : EXPORT_SYMBOL(is_free_buddy_page);
9539 :
9540 : #ifdef CONFIG_MEMORY_FAILURE
9541 : /*
9542 : * Break down a higher-order page into sub-pages, and keep our target page
9543 : * out of the buddy allocator.
9544 : */
9545 : static void break_down_buddy_pages(struct zone *zone, struct page *page,
9546 : struct page *target, int low, int high,
9547 : int migratetype)
9548 : {
9549 : unsigned long size = 1 << high;
9550 : struct page *current_buddy, *next_page;
9551 :
9552 : while (high > low) {
9553 : high--;
9554 : size >>= 1;
9555 :
9556 : if (target >= &page[size]) {
9557 : next_page = page + size;
9558 : current_buddy = page;
9559 : } else {
9560 : next_page = page;
9561 : current_buddy = page + size;
9562 : }
9563 :
9564 : if (set_page_guard(zone, current_buddy, high, migratetype))
9565 : continue;
9566 :
9567 : if (current_buddy != target) {
9568 : add_to_free_list(current_buddy, zone, high, migratetype);
9569 : set_buddy_order(current_buddy, high);
9570 : page = next_page;
9571 : }
9572 : }
9573 : }
9574 :
9575 : /*
9576 : * Take a page that will be marked as poisoned off the buddy allocator.
9577 : */
9578 : bool take_page_off_buddy(struct page *page)
9579 : {
9580 : struct zone *zone = page_zone(page);
9581 : unsigned long pfn = page_to_pfn(page);
9582 : unsigned long flags;
9583 : unsigned int order;
9584 : bool ret = false;
9585 :
9586 : spin_lock_irqsave(&zone->lock, flags);
9587 : for (order = 0; order < MAX_ORDER; order++) {
9588 : struct page *page_head = page - (pfn & ((1 << order) - 1));
9589 : int page_order = buddy_order(page_head);
9590 :
9591 : if (PageBuddy(page_head) && page_order >= order) {
9592 : unsigned long pfn_head = page_to_pfn(page_head);
9593 : int migratetype = get_pfnblock_migratetype(page_head,
9594 : pfn_head);
9595 :
9596 : del_page_from_free_list(page_head, zone, page_order);
9597 : break_down_buddy_pages(zone, page_head, page, 0,
9598 : page_order, migratetype);
9599 : SetPageHWPoisonTakenOff(page);
9600 : if (!is_migrate_isolate(migratetype))
9601 : __mod_zone_freepage_state(zone, -1, migratetype);
9602 : ret = true;
9603 : break;
9604 : }
9605 : if (page_count(page_head) > 0)
9606 : break;
9607 : }
9608 : spin_unlock_irqrestore(&zone->lock, flags);
9609 : return ret;
9610 : }
9611 :
9612 : /*
9613 : * Cancel takeoff done by take_page_off_buddy().
9614 : */
9615 : bool put_page_back_buddy(struct page *page)
9616 : {
9617 : struct zone *zone = page_zone(page);
9618 : unsigned long pfn = page_to_pfn(page);
9619 : unsigned long flags;
9620 : int migratetype = get_pfnblock_migratetype(page, pfn);
9621 : bool ret = false;
9622 :
9623 : spin_lock_irqsave(&zone->lock, flags);
9624 : if (put_page_testzero(page)) {
9625 : ClearPageHWPoisonTakenOff(page);
9626 : __free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE);
9627 : if (TestClearPageHWPoison(page)) {
9628 : num_poisoned_pages_dec();
9629 : ret = true;
9630 : }
9631 : }
9632 : spin_unlock_irqrestore(&zone->lock, flags);
9633 :
9634 : return ret;
9635 : }
9636 : #endif
9637 :
9638 : #ifdef CONFIG_ZONE_DMA
9639 : bool has_managed_dma(void)
9640 : {
9641 : struct pglist_data *pgdat;
9642 :
9643 : for_each_online_pgdat(pgdat) {
9644 : struct zone *zone = &pgdat->node_zones[ZONE_DMA];
9645 :
9646 : if (managed_zone(zone))
9647 : return true;
9648 : }
9649 : return false;
9650 : }
9651 : #endif /* CONFIG_ZONE_DMA */
|