LCOV - code coverage report
Current view: top level - mm - vmstat.c (source / functions)
Test: coverage.info        Date: 2022-12-09 01:23:36
Coverage:   Lines: 7 of 209 hit (3.3 %)   Functions: 1 of 24 hit (4.2 %)

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-only
       2             : /*
       3             :  *  linux/mm/vmstat.c
       4             :  *
       5             :  *  Manages VM statistics
       6             :  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
       7             :  *
       8             :  *  zoned VM statistics
       9             :  *  Copyright (C) 2006 Silicon Graphics, Inc.,
      10             :  *              Christoph Lameter <christoph@lameter.com>
      11             :  *  Copyright (C) 2008-2014 Christoph Lameter
      12             :  */
      13             : #include <linux/fs.h>
      14             : #include <linux/mm.h>
      15             : #include <linux/err.h>
      16             : #include <linux/module.h>
      17             : #include <linux/slab.h>
      18             : #include <linux/cpu.h>
      19             : #include <linux/cpumask.h>
      20             : #include <linux/vmstat.h>
      21             : #include <linux/proc_fs.h>
      22             : #include <linux/seq_file.h>
      23             : #include <linux/debugfs.h>
      24             : #include <linux/sched.h>
      25             : #include <linux/math64.h>
      26             : #include <linux/writeback.h>
      27             : #include <linux/compaction.h>
      28             : #include <linux/mm_inline.h>
      29             : #include <linux/page_ext.h>
      30             : #include <linux/page_owner.h>
      31             : #include <linux/migrate.h>
      32             : 
      33             : #include "internal.h"
      34             : 
      35             : #ifdef CONFIG_NUMA
      36             : int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;
      37             : 
      38             : /* zero numa counters within a zone */
      39             : static void zero_zone_numa_counters(struct zone *zone)
      40             : {
      41             :         int item, cpu;
      42             : 
      43             :         for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) {
      44             :                 atomic_long_set(&zone->vm_numa_event[item], 0);
      45             :                 for_each_online_cpu(cpu) {
      46             :                         per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_event[item]
      47             :                                                 = 0;
      48             :                 }
      49             :         }
      50             : }
      51             : 
      52             : /* zero numa counters of all the populated zones */
      53             : static void zero_zones_numa_counters(void)
      54             : {
      55             :         struct zone *zone;
      56             : 
      57             :         for_each_populated_zone(zone)
      58             :                 zero_zone_numa_counters(zone);
      59             : }
      60             : 
      61             : /* zero global numa counters */
      62             : static void zero_global_numa_counters(void)
      63             : {
      64             :         int item;
      65             : 
      66             :         for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
      67             :                 atomic_long_set(&vm_numa_event[item], 0);
      68             : }
      69             : 
      70             : static void invalid_numa_statistics(void)
      71             : {
      72             :         zero_zones_numa_counters();
      73             :         zero_global_numa_counters();
      74             : }
      75             : 
      76             : static DEFINE_MUTEX(vm_numa_stat_lock);
      77             : 
      78             : int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write,
      79             :                 void *buffer, size_t *length, loff_t *ppos)
      80             : {
      81             :         int ret, oldval;
      82             : 
      83             :         mutex_lock(&vm_numa_stat_lock);
      84             :         if (write)
      85             :                 oldval = sysctl_vm_numa_stat;
      86             :         ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
      87             :         if (ret || !write)
      88             :                 goto out;
      89             : 
      90             :         if (oldval == sysctl_vm_numa_stat)
      91             :                 goto out;
      92             :         else if (sysctl_vm_numa_stat == ENABLE_NUMA_STAT) {
      93             :                 static_branch_enable(&vm_numa_stat_key);
      94             :                 pr_info("enable numa statistics\n");
      95             :         } else {
      96             :                 static_branch_disable(&vm_numa_stat_key);
      97             :                 invalid_numa_statistics();
      98             :                 pr_info("disable numa statistics, and clear numa counters\n");
      99             :         }
     100             : 
     101             : out:
     102             :         mutex_unlock(&vm_numa_stat_lock);
     103             :         return ret;
     104             : }
     105             : #endif
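
The handler above is wired to the vm.numa_stat sysctl, so it can be exercised from userspace through procfs. A minimal, illustrative userspace sketch (not part of vmstat.c; only the /proc/sys/vm/numa_stat path and its 0/1 semantics come from the kernel's sysctl documentation):

        #include <stdio.h>

        int main(void)
        {
                FILE *f = fopen("/proc/sys/vm/numa_stat", "w");

                if (!f)
                        return 1;
                /* 0 disables collection; the handler then flips the
                 * static branch off and zeroes all NUMA counters. */
                fputs("0\n", f);
                return fclose(f) ? 1 : 0;
        }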
     106             : 
     107             : #ifdef CONFIG_VM_EVENT_COUNTERS
     108             : DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
     109             : EXPORT_PER_CPU_SYMBOL(vm_event_states);
     110             : 
     111           0 : static void sum_vm_events(unsigned long *ret)
     112             : {
     113             :         int cpu;
     114             :         int i;
     115             : 
     116           0 :         memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
     117             : 
     118           0 :         for_each_online_cpu(cpu) {
     119             :                 struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
     120             : 
     121           0 :                 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
     122           0 :                         ret[i] += this->event[i];
     123             :         }
     124           0 : }
     125             : 
     126             : /*
     127             :  * Accumulate the vm event counters across all CPUs.
     128             :  * The result is unavoidably approximate - it can change
     129             :  * during and after execution of this function.
      130             :  */
     131           0 : void all_vm_events(unsigned long *ret)
     132             : {
     133             :         cpus_read_lock();
     134           0 :         sum_vm_events(ret);
     135             :         cpus_read_unlock();
     136           0 : }
     137             : EXPORT_SYMBOL_GPL(all_vm_events);
     138             : 
     139             : /*
     140             :  * Fold the foreign cpu events into our own.
     141             :  *
     142             :  * This is adding to the events on one processor
     143             :  * but keeps the global counts constant.
     144             :  */
     145           0 : void vm_events_fold_cpu(int cpu)
     146             : {
     147           0 :         struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
     148             :         int i;
     149             : 
     150           0 :         for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
     151           0 :                 count_vm_events(i, fold_state->event[i]);
     152           0 :                 fold_state->event[i] = 0;
     153             :         }
     154           0 : }
     155             : 
     156             : #endif /* CONFIG_VM_EVENT_COUNTERS */
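
For context, a consumer of these counters snapshots the whole array in one call. The sketch below is a hypothetical kernel-side caller (the function name is invented; all_vm_events(), NR_VM_EVENT_ITEMS and the PGFAULT/PGMAJFAULT items are real declarations from include/linux/vmstat.h and vm_event_item.h):

        static void report_faults_example(void)
        {
                unsigned long events[NR_VM_EVENT_ITEMS];

                all_vm_events(events);  /* approximate, per the comment above */
                pr_info("pgfault=%lu pgmajfault=%lu\n",
                        events[PGFAULT], events[PGMAJFAULT]);
        }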
     157             : 
     158             : /*
     159             :  * Manage combined zone based / global counters
     160             :  *
     161             :  * vm_stat contains the global counters
     162             :  */
     163             : atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
     164             : atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
     165             : atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS] __cacheline_aligned_in_smp;
     166             : EXPORT_SYMBOL(vm_zone_stat);
     167             : EXPORT_SYMBOL(vm_node_stat);
     168             : 
     169             : #ifdef CONFIG_NUMA
     170             : static void fold_vm_zone_numa_events(struct zone *zone)
     171             : {
     172             :         unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { 0, };
     173             :         int cpu;
     174             :         enum numa_stat_item item;
     175             : 
     176             :         for_each_online_cpu(cpu) {
     177             :                 struct per_cpu_zonestat *pzstats;
     178             : 
     179             :                 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
     180             :                 for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
     181             :                         zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], 0);
     182             :         }
     183             : 
     184             :         for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
     185             :                 zone_numa_event_add(zone_numa_events[item], zone, item);
     186             : }
     187             : 
     188             : void fold_vm_numa_events(void)
     189             : {
     190             :         struct zone *zone;
     191             : 
     192             :         for_each_populated_zone(zone)
     193             :                 fold_vm_zone_numa_events(zone);
     194             : }
     195             : #endif
     196             : 
     197             : #ifdef CONFIG_SMP
     198             : 
     199             : int calculate_pressure_threshold(struct zone *zone)
     200             : {
     201             :         int threshold;
     202             :         int watermark_distance;
     203             : 
     204             :         /*
     205             :          * As vmstats are not up to date, there is drift between the estimated
     206             :          * and real values. For high thresholds and a high number of CPUs, it
     207             :          * is possible for the min watermark to be breached while the estimated
     208             :          * value looks fine. The pressure threshold is a reduced value such
     209             :          * that even the maximum amount of drift will not accidentally breach
     210             :          * the min watermark
     211             :          */
     212             :         watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
     213             :         threshold = max(1, (int)(watermark_distance / num_online_cpus()));
     214             : 
     215             :         /*
     216             :          * Maximum threshold is 125
     217             :          */
     218             :         threshold = min(125, threshold);
     219             : 
     220             :         return threshold;
     221             : }
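
To make the drift bound concrete, here is a worked userspace model of the formula above, with assumed (illustrative) watermarks: if low - min is 512 pages on a 64-CPU machine, each CPU may hold at most 8 pages of unsynchronized diff, so even the worst-case total drift of 512 pages cannot push free pages below the min watermark unnoticed:

        #include <stdio.h>

        int main(void)
        {
                int watermark_distance = 512;   /* low_wmark - min_wmark, assumed */
                int cpus = 64;                  /* num_online_cpus(), assumed */
                int threshold = watermark_distance / cpus;

                if (threshold < 1)
                        threshold = 1;
                if (threshold > 125)
                        threshold = 125;
                printf("threshold %d, worst-case drift %d pages\n",
                       threshold, threshold * cpus);
                return 0;
        }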
     222             : 
     223             : int calculate_normal_threshold(struct zone *zone)
     224             : {
     225             :         int threshold;
     226             :         int mem;        /* memory in 128 MB units */
     227             : 
     228             :         /*
     229             :          * The threshold scales with the number of processors and the amount
     230             :          * of memory per zone. More memory means that we can defer updates for
     231             :          * longer, more processors could lead to more contention.
     232             :          * fls() is used to have a cheap way of logarithmic scaling.
     233             :          *
     234             :          * Some sample thresholds:
     235             :          *
     236             :          * Threshold    Processors      (fls)   Zonesize        fls(mem)+1
     237             :          * ------------------------------------------------------------------
     238             :          * 8            1               1       0.9-1 GB        4
     239             :          * 16           2               2       0.9-1 GB        4
     240             :          * 20           2               2       1-2 GB          5
     241             :          * 24           2               2       2-4 GB          6
     242             :          * 28           2               2       4-8 GB          7
     243             :          * 32           2               2       8-16 GB         8
     244             :          * 4            2               2       <128M                1
     245             :          * 30           4               3       2-4 GB          5
     246             :          * 48           4               3       8-16 GB         8
     247             :          * 32           8               4       1-2 GB          4
     248             :          * 32           8               4       0.9-1GB         4
     249             :          * 10           16              5       <128M                1
     250             :          * 40           16              5       900M            4
     251             :          * 70           64              7       2-4 GB          5
     252             :          * 84           64              7       4-8 GB          6
     253             :          * 108          512             9       4-8 GB          6
     254             :          * 125          1024            10      8-16 GB         8
     255             :          * 125          1024            10      16-32 GB        9
     256             :          */
     257             : 
     258             :         mem = zone_managed_pages(zone) >> (27 - PAGE_SHIFT);
     259             : 
     260             :         threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
     261             : 
     262             :         /*
     263             :          * Maximum threshold is 125
     264             :          */
     265             :         threshold = min(125, threshold);
     266             : 
     267             :         return threshold;
     268             : }
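
A userspace model of the same formula reproduces the sample table; fls() is stood in by a builtin-based helper, and the inputs below correspond to the "2 CPUs, 1-2 GB" row (mem is the zone size in 128 MB units, so 1 GB -> 8, and 2 * fls(2) * (1 + fls(8)) = 2 * 2 * 5 = 20):

        #include <stdio.h>

        static int fls_model(unsigned int x)    /* stand-in for kernel fls() */
        {
                return x ? 32 - __builtin_clz(x) : 0;
        }

        int main(void)
        {
                unsigned int cpus = 2;
                unsigned int mem = 8;           /* 1 GB in 128 MB units */
                int threshold = 2 * fls_model(cpus) * (1 + fls_model(mem));

                printf("threshold %d\n", threshold > 125 ? 125 : threshold);
                return 0;
        }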
     269             : 
     270             : /*
     271             :  * Refresh the thresholds for each zone.
     272             :  */
     273             : void refresh_zone_stat_thresholds(void)
     274             : {
     275             :         struct pglist_data *pgdat;
     276             :         struct zone *zone;
     277             :         int cpu;
     278             :         int threshold;
     279             : 
     280             :         /* Zero current pgdat thresholds */
     281             :         for_each_online_pgdat(pgdat) {
     282             :                 for_each_online_cpu(cpu) {
     283             :                         per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold = 0;
     284             :                 }
     285             :         }
     286             : 
     287             :         for_each_populated_zone(zone) {
     288             :                 struct pglist_data *pgdat = zone->zone_pgdat;
     289             :                 unsigned long max_drift, tolerate_drift;
     290             : 
     291             :                 threshold = calculate_normal_threshold(zone);
     292             : 
     293             :                 for_each_online_cpu(cpu) {
     294             :                         int pgdat_threshold;
     295             : 
     296             :                         per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
     297             :                                                         = threshold;
     298             : 
     299             :                         /* Base nodestat threshold on the largest populated zone. */
     300             :                         pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
     301             :                         per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
     302             :                                 = max(threshold, pgdat_threshold);
     303             :                 }
     304             : 
     305             :                 /*
     306             :                  * Only set percpu_drift_mark if there is a danger that
     307             :                  * NR_FREE_PAGES reports the low watermark is ok when in fact
     308             :                  * the min watermark could be breached by an allocation
     309             :                  */
     310             :                 tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
     311             :                 max_drift = num_online_cpus() * threshold;
     312             :                 if (max_drift > tolerate_drift)
     313             :                         zone->percpu_drift_mark = high_wmark_pages(zone) +
     314             :                                         max_drift;
     315             :         }
     316             : }
     317             : 
     318             : void set_pgdat_percpu_threshold(pg_data_t *pgdat,
     319             :                                 int (*calculate_pressure)(struct zone *))
     320             : {
     321             :         struct zone *zone;
     322             :         int cpu;
     323             :         int threshold;
     324             :         int i;
     325             : 
     326             :         for (i = 0; i < pgdat->nr_zones; i++) {
     327             :                 zone = &pgdat->node_zones[i];
     328             :                 if (!zone->percpu_drift_mark)
     329             :                         continue;
     330             : 
     331             :                 threshold = (*calculate_pressure)(zone);
     332             :                 for_each_online_cpu(cpu)
     333             :                         per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
     334             :                                                         = threshold;
     335             :         }
     336             : }
     337             : 
     338             : /*
     339             :  * For use when we know that interrupts are disabled,
     340             :  * or when we know that preemption is disabled and that
     341             :  * particular counter cannot be updated from interrupt context.
     342             :  */
     343             : void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
     344             :                            long delta)
     345             : {
     346             :         struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
     347             :         s8 __percpu *p = pcp->vm_stat_diff + item;
     348             :         long x;
     349             :         long t;
     350             : 
     351             :         /*
     352             :          * Accurate vmstat updates require a RMW. On !PREEMPT_RT kernels,
     353             :          * atomicity is provided by IRQs being disabled -- either explicitly
     354             :          * or via local_lock_irq. On PREEMPT_RT, local_lock_irq only disables
     355             :          * CPU migrations and preemption potentially corrupts a counter so
     356             :          * disable preemption.
     357             :          */
     358             :         if (IS_ENABLED(CONFIG_PREEMPT_RT))
     359             :                 preempt_disable();
     360             : 
     361             :         x = delta + __this_cpu_read(*p);
     362             : 
     363             :         t = __this_cpu_read(pcp->stat_threshold);
     364             : 
     365             :         if (unlikely(abs(x) > t)) {
     366             :                 zone_page_state_add(x, zone, item);
     367             :                 x = 0;
     368             :         }
     369             :         __this_cpu_write(*p, x);
     370             : 
     371             :         if (IS_ENABLED(CONFIG_PREEMPT_RT))
     372             :                 preempt_enable();
     373             : }
     374             : EXPORT_SYMBOL(__mod_zone_page_state);
     375             : 
     376             : void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
     377             :                                 long delta)
     378             : {
     379             :         struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
     380             :         s8 __percpu *p = pcp->vm_node_stat_diff + item;
     381             :         long x;
     382             :         long t;
     383             : 
     384             :         if (vmstat_item_in_bytes(item)) {
     385             :                 /*
     386             :                  * Only cgroups use subpage accounting right now; at
     387             :                  * the global level, these items still change in
     388             :                  * multiples of whole pages. Store them as pages
     389             :                  * internally to keep the per-cpu counters compact.
     390             :                  */
     391             :                 VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
     392             :                 delta >>= PAGE_SHIFT;
     393             :         }
     394             : 
      395             :         /* See __mod_zone_page_state */
     396             :         if (IS_ENABLED(CONFIG_PREEMPT_RT))
     397             :                 preempt_disable();
     398             : 
     399             :         x = delta + __this_cpu_read(*p);
     400             : 
     401             :         t = __this_cpu_read(pcp->stat_threshold);
     402             : 
     403             :         if (unlikely(abs(x) > t)) {
     404             :                 node_page_state_add(x, pgdat, item);
     405             :                 x = 0;
     406             :         }
     407             :         __this_cpu_write(*p, x);
     408             : 
     409             :         if (IS_ENABLED(CONFIG_PREEMPT_RT))
     410             :                 preempt_enable();
     411             : }
     412             : EXPORT_SYMBOL(__mod_node_page_state);
     413             : 
     414             : /*
     415             :  * Optimized increment and decrement functions.
     416             :  *
     417             :  * These are only for a single page and therefore can take a struct page *
     418             :  * argument instead of struct zone *. This allows the inclusion of the code
     419             :  * generated for page_zone(page) into the optimized functions.
     420             :  *
     421             :  * No overflow check is necessary and therefore the differential can be
     422             :  * incremented or decremented in place which may allow the compilers to
     423             :  * generate better code.
     424             :  * The increment or decrement is known and therefore one boundary check can
     425             :  * be omitted.
     426             :  *
     427             :  * NOTE: These functions are very performance sensitive. Change only
     428             :  * with care.
     429             :  *
     430             :  * Some processors have inc/dec instructions that are atomic vs an interrupt.
     431             :  * However, the code must first determine the differential location in a zone
     432             :  * based on the processor number and then inc/dec the counter. There is no
     433             :  * guarantee without disabling preemption that the processor will not change
     434             :  * in between and therefore the atomicity vs. interrupt cannot be exploited
     435             :  * in a useful way here.
     436             :  */
     437             : void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
     438             : {
     439             :         struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
     440             :         s8 __percpu *p = pcp->vm_stat_diff + item;
     441             :         s8 v, t;
     442             : 
      443             :         /* See __mod_zone_page_state */
     444             :         if (IS_ENABLED(CONFIG_PREEMPT_RT))
     445             :                 preempt_disable();
     446             : 
     447             :         v = __this_cpu_inc_return(*p);
     448             :         t = __this_cpu_read(pcp->stat_threshold);
     449             :         if (unlikely(v > t)) {
     450             :                 s8 overstep = t >> 1;
     451             : 
     452             :                 zone_page_state_add(v + overstep, zone, item);
     453             :                 __this_cpu_write(*p, -overstep);
     454             :         }
     455             : 
     456             :         if (IS_ENABLED(CONFIG_PREEMPT_RT))
     457             :                 preempt_enable();
     458             : }
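
The overstep trick above is easiest to see with numbers. A small model (plain C, with local variables standing in for the per-cpu diff and the zone atomic): with t = 30, the increment that reaches 31 folds 31 + 15 = 46 pages into the zone counter and leaves the local diff at -15, so the next 45 increments complete without touching the shared cacheline:

        #include <stdio.h>

        int main(void)
        {
                long zone_counter = 0;
                signed char diff = 30, t = 30;  /* diff sitting at threshold */

                diff++;                         /* __this_cpu_inc_return -> 31 */
                if (diff > t) {
                        signed char overstep = t >> 1;

                        zone_counter += diff + overstep;  /* fold 46 */
                        diff = -overstep;                 /* restart at -15 */
                }
                printf("zone=%ld diff=%d\n", zone_counter, diff);
                return 0;
        }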
     459             : 
     460             : void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
     461             : {
     462             :         struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
     463             :         s8 __percpu *p = pcp->vm_node_stat_diff + item;
     464             :         s8 v, t;
     465             : 
     466             :         VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
     467             : 
      468             :         /* See __mod_zone_page_state */
     469             :         if (IS_ENABLED(CONFIG_PREEMPT_RT))
     470             :                 preempt_disable();
     471             : 
     472             :         v = __this_cpu_inc_return(*p);
     473             :         t = __this_cpu_read(pcp->stat_threshold);
     474             :         if (unlikely(v > t)) {
     475             :                 s8 overstep = t >> 1;
     476             : 
     477             :                 node_page_state_add(v + overstep, pgdat, item);
     478             :                 __this_cpu_write(*p, -overstep);
     479             :         }
     480             : 
     481             :         if (IS_ENABLED(CONFIG_PREEMPT_RT))
     482             :                 preempt_enable();
     483             : }
     484             : 
     485             : void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
     486             : {
     487             :         __inc_zone_state(page_zone(page), item);
     488             : }
     489             : EXPORT_SYMBOL(__inc_zone_page_state);
     490             : 
     491             : void __inc_node_page_state(struct page *page, enum node_stat_item item)
     492             : {
     493             :         __inc_node_state(page_pgdat(page), item);
     494             : }
     495             : EXPORT_SYMBOL(__inc_node_page_state);
     496             : 
     497             : void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
     498             : {
     499             :         struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
     500             :         s8 __percpu *p = pcp->vm_stat_diff + item;
     501             :         s8 v, t;
     502             : 
      503             :         /* See __mod_zone_page_state */
     504             :         if (IS_ENABLED(CONFIG_PREEMPT_RT))
     505             :                 preempt_disable();
     506             : 
     507             :         v = __this_cpu_dec_return(*p);
     508             :         t = __this_cpu_read(pcp->stat_threshold);
      509             :         if (unlikely(v < -t)) {
     510             :                 s8 overstep = t >> 1;
     511             : 
     512             :                 zone_page_state_add(v - overstep, zone, item);
     513             :                 __this_cpu_write(*p, overstep);
     514             :         }
     515             : 
     516             :         if (IS_ENABLED(CONFIG_PREEMPT_RT))
     517             :                 preempt_enable();
     518             : }
     519             : 
     520             : void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
     521             : {
     522             :         struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
     523             :         s8 __percpu *p = pcp->vm_node_stat_diff + item;
     524             :         s8 v, t;
     525             : 
     526             :         VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
     527             : 
      528             :         /* See __mod_zone_page_state */
     529             :         if (IS_ENABLED(CONFIG_PREEMPT_RT))
     530             :                 preempt_disable();
     531             : 
     532             :         v = __this_cpu_dec_return(*p);
     533             :         t = __this_cpu_read(pcp->stat_threshold);
      534             :         if (unlikely(v < -t)) {
     535             :                 s8 overstep = t >> 1;
     536             : 
     537             :                 node_page_state_add(v - overstep, pgdat, item);
     538             :                 __this_cpu_write(*p, overstep);
     539             :         }
     540             : 
     541             :         if (IS_ENABLED(CONFIG_PREEMPT_RT))
     542             :                 preempt_enable();
     543             : }
     544             : 
     545             : void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
     546             : {
     547             :         __dec_zone_state(page_zone(page), item);
     548             : }
     549             : EXPORT_SYMBOL(__dec_zone_page_state);
     550             : 
     551             : void __dec_node_page_state(struct page *page, enum node_stat_item item)
     552             : {
     553             :         __dec_node_state(page_pgdat(page), item);
     554             : }
     555             : EXPORT_SYMBOL(__dec_node_page_state);
     556             : 
     557             : #ifdef CONFIG_HAVE_CMPXCHG_LOCAL
     558             : /*
     559             :  * If we have cmpxchg_local support then we do not need to incur the overhead
     560             :  * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
     561             :  *
     562             :  * mod_state() modifies the zone counter state through atomic per cpu
     563             :  * operations.
     564             :  *
      565             :  * Overstep mode specifies how overstep should be handled:
     566             :  *     0       No overstepping
     567             :  *     1       Overstepping half of threshold
     568             :  *     -1      Overstepping minus half of threshold
      569             :  */
     570             : static inline void mod_zone_state(struct zone *zone,
     571             :        enum zone_stat_item item, long delta, int overstep_mode)
     572             : {
     573             :         struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
     574             :         s8 __percpu *p = pcp->vm_stat_diff + item;
     575             :         long o, n, t, z;
     576             : 
     577             :         do {
     578             :                 z = 0;  /* overflow to zone counters */
     579             : 
     580             :                 /*
     581             :                  * The fetching of the stat_threshold is racy. We may apply
      582             :                  * a counter threshold to the wrong cpu if we get
     583             :                  * rescheduled while executing here. However, the next
     584             :                  * counter update will apply the threshold again and
     585             :                  * therefore bring the counter under the threshold again.
     586             :                  *
      587             :                  * Most of the time the thresholds are the same anyway
     588             :                  * for all cpus in a zone.
     589             :                  */
     590             :                 t = this_cpu_read(pcp->stat_threshold);
     591             : 
     592             :                 o = this_cpu_read(*p);
     593             :                 n = delta + o;
     594             : 
     595             :                 if (abs(n) > t) {
      596             :                         int os = overstep_mode * (t >> 1);
     597             : 
     598             :                         /* Overflow must be added to zone counters */
     599             :                         z = n + os;
     600             :                         n = -os;
     601             :                 }
     602             :         } while (this_cpu_cmpxchg(*p, o, n) != o);
     603             : 
     604             :         if (z)
     605             :                 zone_page_state_add(z, zone, item);
     606             : }
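
The same retry loop can be modeled in userspace with C11 atomics in place of this_cpu_cmpxchg() (an assumption of this sketch: one shared "diff" stands in for the per-cpu slot, which loses the per-cpu aspect but keeps the lock-free fold logic):

        #include <stdatomic.h>
        #include <stdio.h>
        #include <stdlib.h>

        static _Atomic long diff;               /* models pcp->vm_stat_diff */
        static _Atomic long zone_counter;       /* models zone->vm_stat */

        static void mod_state_model(long delta, long t, int overstep_mode)
        {
                long o, n, z;

                do {
                        z = 0;
                        o = atomic_load(&diff);
                        n = delta + o;
                        if (labs(n) > t) {
                                long os = overstep_mode * (t >> 1);

                                z = n + os;     /* overflow to the zone */
                                n = -os;
                        }
                } while (!atomic_compare_exchange_weak(&diff, &o, n));

                if (z)
                        atomic_fetch_add(&zone_counter, z);
        }

        int main(void)
        {
                mod_state_model(40, 30, 1);     /* 40 + 15 folded, diff = -15 */
                printf("zone=%ld diff=%ld\n", atomic_load(&zone_counter),
                       atomic_load(&diff));
                return 0;
        }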
     607             : 
     608             : void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
     609             :                          long delta)
     610             : {
     611             :         mod_zone_state(zone, item, delta, 0);
     612             : }
     613             : EXPORT_SYMBOL(mod_zone_page_state);
     614             : 
     615             : void inc_zone_page_state(struct page *page, enum zone_stat_item item)
     616             : {
     617             :         mod_zone_state(page_zone(page), item, 1, 1);
     618             : }
     619             : EXPORT_SYMBOL(inc_zone_page_state);
     620             : 
     621             : void dec_zone_page_state(struct page *page, enum zone_stat_item item)
     622             : {
     623             :         mod_zone_state(page_zone(page), item, -1, -1);
     624             : }
     625             : EXPORT_SYMBOL(dec_zone_page_state);
     626             : 
     627             : static inline void mod_node_state(struct pglist_data *pgdat,
     628             :        enum node_stat_item item, int delta, int overstep_mode)
     629             : {
     630             :         struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
     631             :         s8 __percpu *p = pcp->vm_node_stat_diff + item;
     632             :         long o, n, t, z;
     633             : 
     634             :         if (vmstat_item_in_bytes(item)) {
     635             :                 /*
     636             :                  * Only cgroups use subpage accounting right now; at
     637             :                  * the global level, these items still change in
     638             :                  * multiples of whole pages. Store them as pages
     639             :                  * internally to keep the per-cpu counters compact.
     640             :                  */
     641             :                 VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
     642             :                 delta >>= PAGE_SHIFT;
     643             :         }
     644             : 
     645             :         do {
     646             :                 z = 0;  /* overflow to node counters */
     647             : 
     648             :                 /*
     649             :                  * The fetching of the stat_threshold is racy. We may apply
      650             :                  * a counter threshold to the wrong cpu if we get
     651             :                  * rescheduled while executing here. However, the next
     652             :                  * counter update will apply the threshold again and
     653             :                  * therefore bring the counter under the threshold again.
     654             :                  *
      655             :                  * Most of the time the thresholds are the same anyway
     656             :                  * for all cpus in a node.
     657             :                  */
     658             :                 t = this_cpu_read(pcp->stat_threshold);
     659             : 
     660             :                 o = this_cpu_read(*p);
     661             :                 n = delta + o;
     662             : 
     663             :                 if (abs(n) > t) {
      664             :                         int os = overstep_mode * (t >> 1);
     665             : 
     666             :                         /* Overflow must be added to node counters */
     667             :                         z = n + os;
     668             :                         n = -os;
     669             :                 }
     670             :         } while (this_cpu_cmpxchg(*p, o, n) != o);
     671             : 
     672             :         if (z)
     673             :                 node_page_state_add(z, pgdat, item);
     674             : }
     675             : 
     676             : void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
     677             :                                         long delta)
     678             : {
     679             :         mod_node_state(pgdat, item, delta, 0);
     680             : }
     681             : EXPORT_SYMBOL(mod_node_page_state);
     682             : 
     683             : void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
     684             : {
     685             :         mod_node_state(pgdat, item, 1, 1);
     686             : }
     687             : 
     688             : void inc_node_page_state(struct page *page, enum node_stat_item item)
     689             : {
     690             :         mod_node_state(page_pgdat(page), item, 1, 1);
     691             : }
     692             : EXPORT_SYMBOL(inc_node_page_state);
     693             : 
     694             : void dec_node_page_state(struct page *page, enum node_stat_item item)
     695             : {
     696             :         mod_node_state(page_pgdat(page), item, -1, -1);
     697             : }
     698             : EXPORT_SYMBOL(dec_node_page_state);
     699             : #else
     700             : /*
     701             :  * Use interrupt disable to serialize counter updates
     702             :  */
     703             : void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
     704             :                          long delta)
     705             : {
     706             :         unsigned long flags;
     707             : 
     708             :         local_irq_save(flags);
     709             :         __mod_zone_page_state(zone, item, delta);
     710             :         local_irq_restore(flags);
     711             : }
     712             : EXPORT_SYMBOL(mod_zone_page_state);
     713             : 
     714             : void inc_zone_page_state(struct page *page, enum zone_stat_item item)
     715             : {
     716             :         unsigned long flags;
     717             :         struct zone *zone;
     718             : 
     719             :         zone = page_zone(page);
     720             :         local_irq_save(flags);
     721             :         __inc_zone_state(zone, item);
     722             :         local_irq_restore(flags);
     723             : }
     724             : EXPORT_SYMBOL(inc_zone_page_state);
     725             : 
     726             : void dec_zone_page_state(struct page *page, enum zone_stat_item item)
     727             : {
     728             :         unsigned long flags;
     729             : 
     730             :         local_irq_save(flags);
     731             :         __dec_zone_page_state(page, item);
     732             :         local_irq_restore(flags);
     733             : }
     734             : EXPORT_SYMBOL(dec_zone_page_state);
     735             : 
     736             : void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
     737             : {
     738             :         unsigned long flags;
     739             : 
     740             :         local_irq_save(flags);
     741             :         __inc_node_state(pgdat, item);
     742             :         local_irq_restore(flags);
     743             : }
     744             : EXPORT_SYMBOL(inc_node_state);
     745             : 
     746             : void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
     747             :                                         long delta)
     748             : {
     749             :         unsigned long flags;
     750             : 
     751             :         local_irq_save(flags);
     752             :         __mod_node_page_state(pgdat, item, delta);
     753             :         local_irq_restore(flags);
     754             : }
     755             : EXPORT_SYMBOL(mod_node_page_state);
     756             : 
     757             : void inc_node_page_state(struct page *page, enum node_stat_item item)
     758             : {
     759             :         unsigned long flags;
     760             :         struct pglist_data *pgdat;
     761             : 
     762             :         pgdat = page_pgdat(page);
     763             :         local_irq_save(flags);
     764             :         __inc_node_state(pgdat, item);
     765             :         local_irq_restore(flags);
     766             : }
     767             : EXPORT_SYMBOL(inc_node_page_state);
     768             : 
     769             : void dec_node_page_state(struct page *page, enum node_stat_item item)
     770             : {
     771             :         unsigned long flags;
     772             : 
     773             :         local_irq_save(flags);
     774             :         __dec_node_page_state(page, item);
     775             :         local_irq_restore(flags);
     776             : }
     777             : EXPORT_SYMBOL(dec_node_page_state);
     778             : #endif
     779             : 
     780             : /*
     781             :  * Fold a differential into the global counters.
     782             :  * Returns the number of counters updated.
     783             :  */
     784             : static int fold_diff(int *zone_diff, int *node_diff)
     785             : {
     786             :         int i;
     787             :         int changes = 0;
     788             : 
     789             :         for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
     790             :                 if (zone_diff[i]) {
     791             :                         atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
     792             :                         changes++;
     793             :         }
     794             : 
     795             :         for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
     796             :                 if (node_diff[i]) {
     797             :                         atomic_long_add(node_diff[i], &vm_node_stat[i]);
     798             :                         changes++;
     799             :         }
     800             :         return changes;
     801             : }
     802             : 
     803             : /*
     804             :  * Update the zone counters for the current cpu.
     805             :  *
     806             :  * Note that refresh_cpu_vm_stats strives to only access
     807             :  * node local memory. The per cpu pagesets on remote zones are placed
     808             :  * in the memory local to the processor using that pageset. So the
     809             :  * loop over all zones will access a series of cachelines local to
     810             :  * the processor.
     811             :  *
     812             :  * The call to zone_page_state_add updates the cachelines with the
     813             :  * statistics in the remote zone struct as well as the global cachelines
     814             :  * with the global counters. These could cause remote node cache line
     815             :  * bouncing and will have to be only done when necessary.
     816             :  *
     817             :  * The function returns the number of global counters updated.
     818             :  */
     819             : static int refresh_cpu_vm_stats(bool do_pagesets)
     820             : {
     821             :         struct pglist_data *pgdat;
     822             :         struct zone *zone;
     823             :         int i;
     824             :         int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
     825             :         int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
     826             :         int changes = 0;
     827             : 
     828             :         for_each_populated_zone(zone) {
     829             :                 struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
     830             : #ifdef CONFIG_NUMA
     831             :                 struct per_cpu_pages __percpu *pcp = zone->per_cpu_pageset;
     832             : #endif
     833             : 
     834             :                 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
     835             :                         int v;
     836             : 
     837             :                         v = this_cpu_xchg(pzstats->vm_stat_diff[i], 0);
     838             :                         if (v) {
     839             : 
     840             :                                 atomic_long_add(v, &zone->vm_stat[i]);
     841             :                                 global_zone_diff[i] += v;
     842             : #ifdef CONFIG_NUMA
     843             :                                 /* 3 seconds idle till flush */
     844             :                                 __this_cpu_write(pcp->expire, 3);
     845             : #endif
     846             :                         }
     847             :                 }
     848             : #ifdef CONFIG_NUMA
     849             : 
     850             :                 if (do_pagesets) {
     851             :                         cond_resched();
     852             :                         /*
     853             :                          * Deal with draining the remote pageset of this
     854             :                          * processor
     855             :                          *
     856             :                          * Check if there are pages remaining in this pageset
     857             :                          * if not then there is nothing to expire.
     858             :                          */
     859             :                         if (!__this_cpu_read(pcp->expire) ||
     860             :                                !__this_cpu_read(pcp->count))
     861             :                                 continue;
     862             : 
     863             :                         /*
     864             :                          * We never drain zones local to this processor.
     865             :                          */
     866             :                         if (zone_to_nid(zone) == numa_node_id()) {
     867             :                                 __this_cpu_write(pcp->expire, 0);
     868             :                                 continue;
     869             :                         }
     870             : 
     871             :                         if (__this_cpu_dec_return(pcp->expire))
     872             :                                 continue;
     873             : 
     874             :                         if (__this_cpu_read(pcp->count)) {
     875             :                                 drain_zone_pages(zone, this_cpu_ptr(pcp));
     876             :                                 changes++;
     877             :                         }
     878             :                 }
     879             : #endif
     880             :         }
     881             : 
     882             :         for_each_online_pgdat(pgdat) {
     883             :                 struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats;
     884             : 
     885             :                 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
     886             :                         int v;
     887             : 
     888             :                         v = this_cpu_xchg(p->vm_node_stat_diff[i], 0);
     889             :                         if (v) {
     890             :                                 atomic_long_add(v, &pgdat->vm_stat[i]);
     891             :                                 global_node_diff[i] += v;
     892             :                         }
     893             :                 }
     894             :         }
     895             : 
     896             :         changes += fold_diff(global_zone_diff, global_node_diff);
     897             :         return changes;
     898             : }
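
The expire handling above is a countdown, not a timestamp: a remote-zone update arms expire to 3, each pass of the vmstat worker (roughly once per second by default) decrements it, and the pageset is drained only after three consecutive idle passes. A minimal model, assuming no new remote updates re-arm the counter:

        #include <stdio.h>

        int main(void)
        {
                int expire = 3;         /* armed by a remote-zone update */
                int count = 42;         /* pages cached in the remote pageset */
                int pass;

                for (pass = 1; expire; pass++) {
                        if (--expire)
                                continue;       /* still counting down */
                        if (count) {            /* third idle pass: drain */
                                printf("pass %d: drain %d pages\n", pass, count);
                                count = 0;
                        }
                }
                return 0;
        }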
     899             : 
     900             : /*
     901             :  * Fold the data for an offline cpu into the global array.
     902             :  * There cannot be any access by the offline cpu and therefore
     903             :  * synchronization is simplified.
     904             :  */
     905             : void cpu_vm_stats_fold(int cpu)
     906             : {
     907             :         struct pglist_data *pgdat;
     908             :         struct zone *zone;
     909             :         int i;
     910             :         int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
     911             :         int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
     912             : 
     913             :         for_each_populated_zone(zone) {
     914             :                 struct per_cpu_zonestat *pzstats;
     915             : 
     916             :                 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
     917             : 
     918             :                 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
     919             :                         if (pzstats->vm_stat_diff[i]) {
     920             :                                 int v;
     921             : 
     922             :                                 v = pzstats->vm_stat_diff[i];
     923             :                                 pzstats->vm_stat_diff[i] = 0;
     924             :                                 atomic_long_add(v, &zone->vm_stat[i]);
     925             :                                 global_zone_diff[i] += v;
     926             :                         }
     927             :                 }
     928             : #ifdef CONFIG_NUMA
     929             :                 for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
     930             :                         if (pzstats->vm_numa_event[i]) {
     931             :                                 unsigned long v;
     932             : 
     933             :                                 v = pzstats->vm_numa_event[i];
     934             :                                 pzstats->vm_numa_event[i] = 0;
     935             :                                 zone_numa_event_add(v, zone, i);
     936             :                         }
     937             :                 }
     938             : #endif
     939             :         }
     940             : 
     941             :         for_each_online_pgdat(pgdat) {
     942             :                 struct per_cpu_nodestat *p;
     943             : 
     944             :                 p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
     945             : 
     946             :                 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
     947             :                         if (p->vm_node_stat_diff[i]) {
     948             :                                 int v;
     949             : 
     950             :                                 v = p->vm_node_stat_diff[i];
     951             :                                 p->vm_node_stat_diff[i] = 0;
     952             :                                 atomic_long_add(v, &pgdat->vm_stat[i]);
     953             :                                 global_node_diff[i] += v;
     954             :                         }
     955             :         }
     956             : 
     957             :         fold_diff(global_zone_diff, global_node_diff);
     958             : }
     959             : 
     960             : /*
     961             :  * this is only called if !populated_zone(zone), which implies no other users of
     962             :  * pset->vm_stat_diff[] exist.
     963             :  */
     964             : void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *pzstats)
     965             : {
     966             :         unsigned long v;
     967             :         int i;
     968             : 
     969             :         for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
     970             :                 if (pzstats->vm_stat_diff[i]) {
     971             :                         v = pzstats->vm_stat_diff[i];
     972             :                         pzstats->vm_stat_diff[i] = 0;
     973             :                         zone_page_state_add(v, zone, i);
     974             :                 }
     975             :         }
     976             : 
     977             : #ifdef CONFIG_NUMA
     978             :         for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
     979             :                 if (pzstats->vm_numa_event[i]) {
     980             :                         v = pzstats->vm_numa_event[i];
     981             :                         pzstats->vm_numa_event[i] = 0;
     982             :                         zone_numa_event_add(v, zone, i);
     983             :                 }
     984             :         }
     985             : #endif
     986             : }
     987             : #endif
     988             : 
     989             : #ifdef CONFIG_NUMA
     990             : /*
     991             :  * Determine the per node value of a stat item. This function
     992             :  * is called frequently in a NUMA machine, so try to be as
     993             :  * frugal as possible.
     994             :  */
     995             : unsigned long sum_zone_node_page_state(int node,
     996             :                                  enum zone_stat_item item)
     997             : {
     998             :         struct zone *zones = NODE_DATA(node)->node_zones;
     999             :         int i;
    1000             :         unsigned long count = 0;
    1001             : 
    1002             :         for (i = 0; i < MAX_NR_ZONES; i++)
    1003             :                 count += zone_page_state(zones + i, item);
    1004             : 
    1005             :         return count;
    1006             : }
    1007             : 
    1008             : /* Determine the per node value of a numa stat item. */
    1009             : unsigned long sum_zone_numa_event_state(int node,
    1010             :                                  enum numa_stat_item item)
    1011             : {
    1012             :         struct zone *zones = NODE_DATA(node)->node_zones;
    1013             :         unsigned long count = 0;
    1014             :         int i;
    1015             : 
    1016             :         for (i = 0; i < MAX_NR_ZONES; i++)
    1017             :                 count += zone_numa_event_state(zones + i, item);
    1018             : 
    1019             :         return count;
    1020             : }
    1021             : 
    1022             : /*
    1023             :  * Determine the per node value of a stat item.
    1024             :  */
    1025             : unsigned long node_page_state_pages(struct pglist_data *pgdat,
    1026             :                                     enum node_stat_item item)
    1027             : {
    1028             :         long x = atomic_long_read(&pgdat->vm_stat[item]);
    1029             : #ifdef CONFIG_SMP
    1030             :         if (x < 0)
    1031             :                 x = 0;
    1032             : #endif
    1033             :         return x;
    1034             : }
    1035             : 
    1036             : unsigned long node_page_state(struct pglist_data *pgdat,
    1037             :                               enum node_stat_item item)
    1038             : {
    1039             :         VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
    1040             : 
    1041             :         return node_page_state_pages(pgdat, item);
    1042             : }
    1043             : #endif
    1044             : 
    1045             : #ifdef CONFIG_COMPACTION
    1046             : 
    1047             : struct contig_page_info {
    1048             :         unsigned long free_pages;
    1049             :         unsigned long free_blocks_total;
    1050             :         unsigned long free_blocks_suitable;
    1051             : };
    1052             : 
    1053             : /*
    1054             :  * Calculate the number of free pages in a zone, how many contiguous
    1055             :  * pages are free and how many are large enough to satisfy an allocation of
    1056             :  * the target size. Note that this function makes no attempt to estimate
    1057             :  * how many suitable free blocks there *might* be if MOVABLE pages were
    1058             :  * migrated. Calculating that is possible, but expensive and can be
     1059             :  * figured out from userspace.
    1060             :  */
    1061             : static void fill_contig_page_info(struct zone *zone,
    1062             :                                 unsigned int suitable_order,
    1063             :                                 struct contig_page_info *info)
    1064             : {
    1065             :         unsigned int order;
    1066             : 
    1067           0 :         info->free_pages = 0;
    1068           0 :         info->free_blocks_total = 0;
    1069           0 :         info->free_blocks_suitable = 0;
    1070             : 
    1071           0 :         for (order = 0; order < MAX_ORDER; order++) {
    1072             :                 unsigned long blocks;
    1073             : 
    1074             :                 /*
    1075             :                  * Count number of free blocks.
    1076             :                  *
    1077             :                  * Access to nr_free is lockless as nr_free is used only for
    1078             :                  * diagnostic purposes. Use data_race to avoid KCSAN warning.
    1079             :                  */
    1080           0 :                 blocks = data_race(zone->free_area[order].nr_free);
    1081           0 :                 info->free_blocks_total += blocks;
    1082             : 
    1083             :                 /* Count free base pages */
    1084           0 :                 info->free_pages += blocks << order;
    1085             : 
    1086             :                 /* Count the suitable free blocks */
    1087           0 :                 if (order >= suitable_order)
    1088           0 :                         info->free_blocks_suitable += blocks <<
    1089           0 :                                                 (order - suitable_order);
    1090             :         }
    1091             : }
    1092             : 
    1093             : /*
    1094             :  * A fragmentation index only makes sense if an allocation of a requested
    1095             :  * size would fail. If that is true, the fragmentation index indicates
    1096             :  * whether external fragmentation or a lack of memory was the problem.
    1097             :  * The value can be used to determine if page reclaim or compaction
     1098             :  * should be used.
    1099             :  */
    1100           0 : static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
    1101             : {
    1102           0 :         unsigned long requested = 1UL << order;
    1103             : 
    1104           0 :         if (WARN_ON_ONCE(order >= MAX_ORDER))
    1105             :                 return 0;
    1106             : 
    1107           0 :         if (!info->free_blocks_total)
    1108             :                 return 0;
    1109             : 
    1110             :         /* Fragmentation index only makes sense when a request would fail */
    1111           0 :         if (info->free_blocks_suitable)
    1112             :                 return -1000;
    1113             : 
    1114             :         /*
     1115             :          * The index is between 0 and 1, so return it scaled by 1000 to give 3 decimal places:
    1116             :          *
    1117             :          * 0 => allocation would fail due to lack of memory
    1118             :          * 1 => allocation would fail due to fragmentation
    1119             :          */
     1120           0 :         return 1000 - div_u64((1000 + div_u64(info->free_pages * 1000ULL, requested)), info->free_blocks_total);
    1121             : }
    1122             : 
    1123             : /*
    1124             :  * Calculates external fragmentation within a zone wrt the given order.
    1125             :  * It is defined as the percentage of pages found in blocks of size
    1126             :  * less than 1 << order. It returns values in range [0, 100].
    1127             :  */
    1128           0 : unsigned int extfrag_for_order(struct zone *zone, unsigned int order)
    1129             : {
    1130             :         struct contig_page_info info;
    1131             : 
    1132           0 :         fill_contig_page_info(zone, order, &info);
    1133           0 :         if (info.free_pages == 0)
    1134             :                 return 0;
    1135             : 
    1136           0 :         return div_u64((info.free_pages -
    1137           0 :                         (info.free_blocks_suitable << order)) * 100,
    1138             :                         info.free_pages);
    1139             : }
    1140             : 
     1141             : /* Same as __fragmentation_index() but allocates contig_page_info on the stack */
    1142           0 : int fragmentation_index(struct zone *zone, unsigned int order)
    1143             : {
    1144             :         struct contig_page_info info;
    1145             : 
    1146           0 :         fill_contig_page_info(zone, order, &info);
    1147           0 :         return __fragmentation_index(order, &info);
    1148             : }
    1149             : #endif
    1150             : 
    1151             : #if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || \
    1152             :     defined(CONFIG_NUMA) || defined(CONFIG_MEMCG)
    1153             : #ifdef CONFIG_ZONE_DMA
    1154             : #define TEXT_FOR_DMA(xx) xx "_dma",
    1155             : #else
    1156             : #define TEXT_FOR_DMA(xx)
    1157             : #endif
    1158             : 
    1159             : #ifdef CONFIG_ZONE_DMA32
    1160             : #define TEXT_FOR_DMA32(xx) xx "_dma32",
    1161             : #else
    1162             : #define TEXT_FOR_DMA32(xx)
    1163             : #endif
    1164             : 
    1165             : #ifdef CONFIG_HIGHMEM
    1166             : #define TEXT_FOR_HIGHMEM(xx) xx "_high",
    1167             : #else
    1168             : #define TEXT_FOR_HIGHMEM(xx)
    1169             : #endif
    1170             : 
    1171             : #define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
    1172             :                                         TEXT_FOR_HIGHMEM(xx) xx "_movable",
    1173             : 
    1174             : const char * const vmstat_text[] = {
    1175             :         /* enum zone_stat_item counters */
    1176             :         "nr_free_pages",
    1177             :         "nr_zone_inactive_anon",
    1178             :         "nr_zone_active_anon",
    1179             :         "nr_zone_inactive_file",
    1180             :         "nr_zone_active_file",
    1181             :         "nr_zone_unevictable",
    1182             :         "nr_zone_write_pending",
    1183             :         "nr_mlock",
    1184             :         "nr_bounce",
    1185             : #if IS_ENABLED(CONFIG_ZSMALLOC)
    1186             :         "nr_zspages",
    1187             : #endif
    1188             :         "nr_free_cma",
    1189             : 
    1190             :         /* enum numa_stat_item counters */
    1191             : #ifdef CONFIG_NUMA
    1192             :         "numa_hit",
    1193             :         "numa_miss",
    1194             :         "numa_foreign",
    1195             :         "numa_interleave",
    1196             :         "numa_local",
    1197             :         "numa_other",
    1198             : #endif
    1199             : 
    1200             :         /* enum node_stat_item counters */
    1201             :         "nr_inactive_anon",
    1202             :         "nr_active_anon",
    1203             :         "nr_inactive_file",
    1204             :         "nr_active_file",
    1205             :         "nr_unevictable",
    1206             :         "nr_slab_reclaimable",
    1207             :         "nr_slab_unreclaimable",
    1208             :         "nr_isolated_anon",
    1209             :         "nr_isolated_file",
    1210             :         "workingset_nodes",
    1211             :         "workingset_refault_anon",
    1212             :         "workingset_refault_file",
    1213             :         "workingset_activate_anon",
    1214             :         "workingset_activate_file",
    1215             :         "workingset_restore_anon",
    1216             :         "workingset_restore_file",
    1217             :         "workingset_nodereclaim",
    1218             :         "nr_anon_pages",
    1219             :         "nr_mapped",
    1220             :         "nr_file_pages",
    1221             :         "nr_dirty",
    1222             :         "nr_writeback",
    1223             :         "nr_writeback_temp",
    1224             :         "nr_shmem",
    1225             :         "nr_shmem_hugepages",
    1226             :         "nr_shmem_pmdmapped",
    1227             :         "nr_file_hugepages",
    1228             :         "nr_file_pmdmapped",
    1229             :         "nr_anon_transparent_hugepages",
    1230             :         "nr_vmscan_write",
    1231             :         "nr_vmscan_immediate_reclaim",
    1232             :         "nr_dirtied",
    1233             :         "nr_written",
    1234             :         "nr_throttled_written",
    1235             :         "nr_kernel_misc_reclaimable",
    1236             :         "nr_foll_pin_acquired",
    1237             :         "nr_foll_pin_released",
    1238             :         "nr_kernel_stack",
    1239             : #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
    1240             :         "nr_shadow_call_stack",
    1241             : #endif
    1242             :         "nr_page_table_pages",
    1243             : #ifdef CONFIG_SWAP
    1244             :         "nr_swapcached",
    1245             : #endif
    1246             : #ifdef CONFIG_NUMA_BALANCING
    1247             :         "pgpromote_success",
    1248             : #endif
    1249             : 
    1250             :         /* enum writeback_stat_item counters */
    1251             :         "nr_dirty_threshold",
    1252             :         "nr_dirty_background_threshold",
    1253             : 
    1254             : #if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG)
    1255             :         /* enum vm_event_item counters */
    1256             :         "pgpgin",
    1257             :         "pgpgout",
    1258             :         "pswpin",
    1259             :         "pswpout",
    1260             : 
    1261             :         TEXTS_FOR_ZONES("pgalloc")
    1262             :         TEXTS_FOR_ZONES("allocstall")
    1263             :         TEXTS_FOR_ZONES("pgskip")
    1264             : 
    1265             :         "pgfree",
    1266             :         "pgactivate",
    1267             :         "pgdeactivate",
    1268             :         "pglazyfree",
    1269             : 
    1270             :         "pgfault",
    1271             :         "pgmajfault",
    1272             :         "pglazyfreed",
    1273             : 
    1274             :         "pgrefill",
    1275             :         "pgreuse",
    1276             :         "pgsteal_kswapd",
    1277             :         "pgsteal_direct",
    1278             :         "pgdemote_kswapd",
    1279             :         "pgdemote_direct",
    1280             :         "pgscan_kswapd",
    1281             :         "pgscan_direct",
    1282             :         "pgscan_direct_throttle",
    1283             :         "pgscan_anon",
    1284             :         "pgscan_file",
    1285             :         "pgsteal_anon",
    1286             :         "pgsteal_file",
    1287             : 
    1288             : #ifdef CONFIG_NUMA
    1289             :         "zone_reclaim_failed",
    1290             : #endif
    1291             :         "pginodesteal",
    1292             :         "slabs_scanned",
    1293             :         "kswapd_inodesteal",
    1294             :         "kswapd_low_wmark_hit_quickly",
    1295             :         "kswapd_high_wmark_hit_quickly",
    1296             :         "pageoutrun",
    1297             : 
    1298             :         "pgrotated",
    1299             : 
    1300             :         "drop_pagecache",
    1301             :         "drop_slab",
    1302             :         "oom_kill",
    1303             : 
    1304             : #ifdef CONFIG_NUMA_BALANCING
    1305             :         "numa_pte_updates",
    1306             :         "numa_huge_pte_updates",
    1307             :         "numa_hint_faults",
    1308             :         "numa_hint_faults_local",
    1309             :         "numa_pages_migrated",
    1310             : #endif
    1311             : #ifdef CONFIG_MIGRATION
    1312             :         "pgmigrate_success",
    1313             :         "pgmigrate_fail",
    1314             :         "thp_migration_success",
    1315             :         "thp_migration_fail",
    1316             :         "thp_migration_split",
    1317             : #endif
    1318             : #ifdef CONFIG_COMPACTION
    1319             :         "compact_migrate_scanned",
    1320             :         "compact_free_scanned",
    1321             :         "compact_isolated",
    1322             :         "compact_stall",
    1323             :         "compact_fail",
    1324             :         "compact_success",
    1325             :         "compact_daemon_wake",
    1326             :         "compact_daemon_migrate_scanned",
    1327             :         "compact_daemon_free_scanned",
    1328             : #endif
    1329             : 
    1330             : #ifdef CONFIG_HUGETLB_PAGE
    1331             :         "htlb_buddy_alloc_success",
    1332             :         "htlb_buddy_alloc_fail",
    1333             : #endif
    1334             : #ifdef CONFIG_CMA
    1335             :         "cma_alloc_success",
    1336             :         "cma_alloc_fail",
    1337             : #endif
    1338             :         "unevictable_pgs_culled",
    1339             :         "unevictable_pgs_scanned",
    1340             :         "unevictable_pgs_rescued",
    1341             :         "unevictable_pgs_mlocked",
    1342             :         "unevictable_pgs_munlocked",
    1343             :         "unevictable_pgs_cleared",
    1344             :         "unevictable_pgs_stranded",
    1345             : 
    1346             : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
    1347             :         "thp_fault_alloc",
    1348             :         "thp_fault_fallback",
    1349             :         "thp_fault_fallback_charge",
    1350             :         "thp_collapse_alloc",
    1351             :         "thp_collapse_alloc_failed",
    1352             :         "thp_file_alloc",
    1353             :         "thp_file_fallback",
    1354             :         "thp_file_fallback_charge",
    1355             :         "thp_file_mapped",
    1356             :         "thp_split_page",
    1357             :         "thp_split_page_failed",
    1358             :         "thp_deferred_split_page",
    1359             :         "thp_split_pmd",
    1360             :         "thp_scan_exceed_none_pte",
    1361             :         "thp_scan_exceed_swap_pte",
    1362             :         "thp_scan_exceed_share_pte",
    1363             : #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
    1364             :         "thp_split_pud",
    1365             : #endif
    1366             :         "thp_zero_page_alloc",
    1367             :         "thp_zero_page_alloc_failed",
    1368             :         "thp_swpout",
    1369             :         "thp_swpout_fallback",
    1370             : #endif
    1371             : #ifdef CONFIG_MEMORY_BALLOON
    1372             :         "balloon_inflate",
    1373             :         "balloon_deflate",
    1374             : #ifdef CONFIG_BALLOON_COMPACTION
    1375             :         "balloon_migrate",
    1376             : #endif
    1377             : #endif /* CONFIG_MEMORY_BALLOON */
    1378             : #ifdef CONFIG_DEBUG_TLBFLUSH
    1379             :         "nr_tlb_remote_flush",
    1380             :         "nr_tlb_remote_flush_received",
    1381             :         "nr_tlb_local_flush_all",
    1382             :         "nr_tlb_local_flush_one",
    1383             : #endif /* CONFIG_DEBUG_TLBFLUSH */
    1384             : 
    1385             : #ifdef CONFIG_DEBUG_VM_VMACACHE
    1386             :         "vmacache_find_calls",
    1387             :         "vmacache_find_hits",
    1388             : #endif
    1389             : #ifdef CONFIG_SWAP
    1390             :         "swap_ra",
    1391             :         "swap_ra_hit",
    1392             : #ifdef CONFIG_KSM
    1393             :         "ksm_swpin_copy",
    1394             : #endif
    1395             : #endif
    1396             : #ifdef CONFIG_X86
    1397             :         "direct_map_level2_splits",
    1398             :         "direct_map_level3_splits",
    1399             : #endif
    1400             : #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
    1401             : };
    1402             : #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
    1403             : 
    1404             : #if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \
    1405             :      defined(CONFIG_PROC_FS)
    1406           0 : static void *frag_start(struct seq_file *m, loff_t *pos)
    1407             : {
    1408             :         pg_data_t *pgdat;
    1409           0 :         loff_t node = *pos;
    1410             : 
    1411           0 :         for (pgdat = first_online_pgdat();
    1412           0 :              pgdat && node;
    1413           0 :              pgdat = next_online_pgdat(pgdat))
    1414           0 :                 --node;
    1415             : 
    1416           0 :         return pgdat;
    1417             : }
    1418             : 
    1419           0 : static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
    1420             : {
    1421           0 :         pg_data_t *pgdat = (pg_data_t *)arg;
    1422             : 
    1423           0 :         (*pos)++;
    1424           0 :         return next_online_pgdat(pgdat);
    1425             : }
    1426             : 
    1427           0 : static void frag_stop(struct seq_file *m, void *arg)
    1428             : {
    1429           0 : }
    1430             : 
    1431             : /*
    1432             :  * Walk zones in a node and print using a callback.
    1433             :  * If @assert_populated is true, only use callback for zones that are populated.
    1434             :  */
    1435           0 : static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
    1436             :                 bool assert_populated, bool nolock,
    1437             :                 void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
    1438             : {
    1439             :         struct zone *zone;
    1440           0 :         struct zone *node_zones = pgdat->node_zones;
    1441             :         unsigned long flags;
    1442             : 
    1443           0 :         for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
    1444           0 :                 if (assert_populated && !populated_zone(zone))
    1445           0 :                         continue;
    1446             : 
    1447           0 :                 if (!nolock)
    1448           0 :                         spin_lock_irqsave(&zone->lock, flags);
    1449           0 :                 print(m, pgdat, zone);
    1450           0 :                 if (!nolock)
    1451           0 :                         spin_unlock_irqrestore(&zone->lock, flags);
    1452             :         }
    1453           0 : }
    1454             : #endif
    1455             : 
    1456             : #ifdef CONFIG_PROC_FS
    1457           0 : static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
    1458             :                                                 struct zone *zone)
    1459             : {
    1460             :         int order;
    1461             : 
    1462           0 :         seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
    1463           0 :         for (order = 0; order < MAX_ORDER; ++order)
    1464             :                 /*
    1465             :                  * Access to nr_free is lockless as nr_free is used only for
    1466             :                  * printing purposes. Use data_race to avoid KCSAN warning.
    1467             :                  */
    1468           0 :                 seq_printf(m, "%6lu ", data_race(zone->free_area[order].nr_free));
    1469           0 :         seq_putc(m, '\n');
    1470           0 : }
    1471             : 
    1472             : /*
    1473             :  * This walks the free areas for each zone.
    1474             :  */
    1475           0 : static int frag_show(struct seq_file *m, void *arg)
    1476             : {
    1477           0 :         pg_data_t *pgdat = (pg_data_t *)arg;
    1478           0 :         walk_zones_in_node(m, pgdat, true, false, frag_show_print);
    1479           0 :         return 0;
    1480             : }
    1481             : 
    1482           0 : static void pagetypeinfo_showfree_print(struct seq_file *m,
    1483             :                                         pg_data_t *pgdat, struct zone *zone)
    1484             : {
    1485             :         int order, mtype;
    1486             : 
    1487           0 :         for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
    1488           0 :                 seq_printf(m, "Node %4d, zone %8s, type %12s ",
    1489             :                                         pgdat->node_id,
    1490             :                                         zone->name,
    1491             :                                         migratetype_names[mtype]);
    1492           0 :                 for (order = 0; order < MAX_ORDER; ++order) {
    1493           0 :                         unsigned long freecount = 0;
    1494             :                         struct free_area *area;
    1495             :                         struct list_head *curr;
    1496           0 :                         bool overflow = false;
    1497             : 
    1498           0 :                         area = &(zone->free_area[order]);
    1499             : 
    1500           0 :                         list_for_each(curr, &area->free_list[mtype]) {
    1501             :                                 /*
    1502             :                                  * Cap the free_list iteration because it might
    1503             :                                  * be really large and we are under a spinlock
    1504             :                                  * so a long time spent here could trigger a
     1505             :                                  * hard lockup detector. Anyway, this is a
     1506             :                                  * debugging tool, so an approximate count of
     1507             :                                  * the pages at this order is more than
     1508             :                                  * sufficient.
    1509             :                                  */
    1510           0 :                                 if (++freecount >= 100000) {
    1511             :                                         overflow = true;
    1512             :                                         break;
    1513             :                                 }
    1514             :                         }
    1515           0 :                         seq_printf(m, "%s%6lu ", overflow ? ">" : "", freecount);
    1516           0 :                         spin_unlock_irq(&zone->lock);
    1517           0 :                         cond_resched();
    1518           0 :                         spin_lock_irq(&zone->lock);
    1519             :                 }
    1520           0 :                 seq_putc(m, '\n');
    1521             :         }
    1522           0 : }
    1523             : 
     1524             : /* Print out the free pages at each order for each migratetype */
    1525           0 : static void pagetypeinfo_showfree(struct seq_file *m, void *arg)
    1526             : {
    1527             :         int order;
    1528           0 :         pg_data_t *pgdat = (pg_data_t *)arg;
    1529             : 
    1530             :         /* Print header */
    1531           0 :         seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
    1532           0 :         for (order = 0; order < MAX_ORDER; ++order)
    1533           0 :                 seq_printf(m, "%6d ", order);
    1534           0 :         seq_putc(m, '\n');
    1535             : 
    1536           0 :         walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showfree_print);
    1537           0 : }
    1538             : 
    1539           0 : static void pagetypeinfo_showblockcount_print(struct seq_file *m,
    1540             :                                         pg_data_t *pgdat, struct zone *zone)
    1541             : {
    1542             :         int mtype;
    1543             :         unsigned long pfn;
    1544           0 :         unsigned long start_pfn = zone->zone_start_pfn;
    1545           0 :         unsigned long end_pfn = zone_end_pfn(zone);
    1546           0 :         unsigned long count[MIGRATE_TYPES] = { 0, };
    1547             : 
    1548           0 :         for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
    1549             :                 struct page *page;
    1550             : 
    1551           0 :                 page = pfn_to_online_page(pfn);
    1552           0 :                 if (!page)
    1553           0 :                         continue;
    1554             : 
    1555           0 :                 if (page_zone(page) != zone)
    1556           0 :                         continue;
    1557             : 
    1558           0 :                 mtype = get_pageblock_migratetype(page);
    1559             : 
    1560           0 :                 if (mtype < MIGRATE_TYPES)
    1561           0 :                         count[mtype]++;
    1562             :         }
    1563             : 
    1564             :         /* Print counts */
    1565           0 :         seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
    1566           0 :         for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
    1567           0 :                 seq_printf(m, "%12lu ", count[mtype]);
    1568           0 :         seq_putc(m, '\n');
    1569           0 : }
    1570             : 
    1571             : /* Print out the number of pageblocks for each migratetype */
    1572           0 : static void pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
    1573             : {
    1574             :         int mtype;
    1575           0 :         pg_data_t *pgdat = (pg_data_t *)arg;
    1576             : 
    1577           0 :         seq_printf(m, "\n%-23s", "Number of blocks type ");
    1578           0 :         for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
    1579           0 :                 seq_printf(m, "%12s ", migratetype_names[mtype]);
    1580           0 :         seq_putc(m, '\n');
    1581           0 :         walk_zones_in_node(m, pgdat, true, false,
    1582             :                 pagetypeinfo_showblockcount_print);
    1583           0 : }
    1584             : 
    1585             : /*
    1586             :  * Print out the number of pageblocks for each migratetype that contain pages
    1587             :  * of other types. This gives an indication of how well fallbacks are being
    1588             :  * contained by rmqueue_fallback(). It requires information from PAGE_OWNER
     1589             :  * to determine what is going on.
    1590             :  */
    1591             : static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
    1592             : {
    1593             : #ifdef CONFIG_PAGE_OWNER
    1594             :         int mtype;
    1595             : 
    1596             :         if (!static_branch_unlikely(&page_owner_inited))
    1597             :                 return;
    1598             : 
    1599             :         drain_all_pages(NULL);
    1600             : 
    1601             :         seq_printf(m, "\n%-23s", "Number of mixed blocks ");
    1602             :         for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
    1603             :                 seq_printf(m, "%12s ", migratetype_names[mtype]);
    1604             :         seq_putc(m, '\n');
    1605             : 
    1606             :         walk_zones_in_node(m, pgdat, true, true,
    1607             :                 pagetypeinfo_showmixedcount_print);
    1608             : #endif /* CONFIG_PAGE_OWNER */
    1609             : }
    1610             : 
    1611             : /*
    1612             :  * This prints out statistics in relation to grouping pages by mobility.
     1613             :  * It is expensive to collect, so do not read the file constantly.
    1614             :  */
    1615           0 : static int pagetypeinfo_show(struct seq_file *m, void *arg)
    1616             : {
    1617           0 :         pg_data_t *pgdat = (pg_data_t *)arg;
    1618             : 
    1619             :         /* check memoryless node */
    1620           0 :         if (!node_state(pgdat->node_id, N_MEMORY))
    1621             :                 return 0;
    1622             : 
    1623           0 :         seq_printf(m, "Page block order: %d\n", pageblock_order);
    1624           0 :         seq_printf(m, "Pages per block:  %lu\n", pageblock_nr_pages);
    1625           0 :         seq_putc(m, '\n');
    1626           0 :         pagetypeinfo_showfree(m, pgdat);
    1627           0 :         pagetypeinfo_showblockcount(m, pgdat);
    1628           0 :         pagetypeinfo_showmixedcount(m, pgdat);
    1629             : 
    1630           0 :         return 0;
    1631             : }
    1632             : 
    1633             : static const struct seq_operations fragmentation_op = {
    1634             :         .start  = frag_start,
    1635             :         .next   = frag_next,
    1636             :         .stop   = frag_stop,
    1637             :         .show   = frag_show,
    1638             : };
    1639             : 
    1640             : static const struct seq_operations pagetypeinfo_op = {
    1641             :         .start  = frag_start,
    1642             :         .next   = frag_next,
    1643             :         .stop   = frag_stop,
    1644             :         .show   = pagetypeinfo_show,
    1645             : };
    1646             : 
    1647             : static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone)
    1648             : {
    1649             :         int zid;
    1650             : 
    1651           0 :         for (zid = 0; zid < MAX_NR_ZONES; zid++) {
    1652           0 :                 struct zone *compare = &pgdat->node_zones[zid];
    1653             : 
    1654           0 :                 if (populated_zone(compare))
    1655           0 :                         return zone == compare;
    1656             :         }
    1657             : 
    1658             :         return false;
    1659             : }
    1660             : 
    1661           0 : static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
    1662             :                                                         struct zone *zone)
    1663             : {
    1664             :         int i;
    1665           0 :         seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
    1666           0 :         if (is_zone_first_populated(pgdat, zone)) {
    1667           0 :                 seq_printf(m, "\n  per-node stats");
    1668           0 :                 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
    1669           0 :                         unsigned long pages = node_page_state_pages(pgdat, i);
    1670             : 
    1671           0 :                         if (vmstat_item_print_in_thp(i))
    1672             :                                 pages /= HPAGE_PMD_NR;
    1673           0 :                         seq_printf(m, "\n      %-12s %lu", node_stat_name(i),
    1674             :                                    pages);
    1675             :                 }
    1676             :         }
    1677           0 :         seq_printf(m,
    1678             :                    "\n  pages free     %lu"
    1679             :                    "\n        boost    %lu"
    1680             :                    "\n        min      %lu"
    1681             :                    "\n        low      %lu"
    1682             :                    "\n        high     %lu"
    1683             :                    "\n        spanned  %lu"
    1684             :                    "\n        present  %lu"
    1685             :                    "\n        managed  %lu"
    1686             :                    "\n        cma      %lu",
    1687             :                    zone_page_state(zone, NR_FREE_PAGES),
    1688             :                    zone->watermark_boost,
    1689           0 :                    min_wmark_pages(zone),
    1690           0 :                    low_wmark_pages(zone),
    1691           0 :                    high_wmark_pages(zone),
    1692             :                    zone->spanned_pages,
    1693             :                    zone->present_pages,
    1694             :                    zone_managed_pages(zone),
    1695             :                    zone_cma_pages(zone));
    1696             : 
    1697           0 :         seq_printf(m,
    1698             :                    "\n        protection: (%ld",
    1699             :                    zone->lowmem_reserve[0]);
    1700           0 :         for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
    1701           0 :                 seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
    1702           0 :         seq_putc(m, ')');
    1703             : 
    1704             :         /* If unpopulated, no other information is useful */
    1705           0 :         if (!populated_zone(zone)) {
    1706           0 :                 seq_putc(m, '\n');
    1707           0 :                 return;
    1708             :         }
    1709             : 
    1710           0 :         for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
    1711           0 :                 seq_printf(m, "\n      %-12s %lu", zone_stat_name(i),
    1712             :                            zone_page_state(zone, i));
    1713             : 
    1714             : #ifdef CONFIG_NUMA
    1715             :         for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
    1716             :                 seq_printf(m, "\n      %-12s %lu", numa_stat_name(i),
    1717             :                            zone_numa_event_state(zone, i));
    1718             : #endif
    1719             : 
    1720           0 :         seq_printf(m, "\n  pagesets");
    1721           0 :         for_each_online_cpu(i) {
    1722             :                 struct per_cpu_pages *pcp;
    1723             :                 struct per_cpu_zonestat __maybe_unused *pzstats;
    1724             : 
    1725           0 :                 pcp = per_cpu_ptr(zone->per_cpu_pageset, i);
    1726           0 :                 seq_printf(m,
    1727             :                            "\n    cpu: %i"
    1728             :                            "\n              count: %i"
    1729             :                            "\n              high:  %i"
    1730             :                            "\n              batch: %i",
    1731             :                            i,
    1732             :                            pcp->count,
    1733             :                            pcp->high,
    1734             :                            pcp->batch);
    1735             : #ifdef CONFIG_SMP
    1736             :                 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, i);
    1737             :                 seq_printf(m, "\n  vm stats threshold: %d",
    1738             :                                 pzstats->stat_threshold);
    1739             : #endif
    1740             :         }
    1741           0 :         seq_printf(m,
    1742             :                    "\n  node_unreclaimable:  %u"
    1743             :                    "\n  start_pfn:           %lu",
    1744           0 :                    pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
    1745             :                    zone->zone_start_pfn);
    1746           0 :         seq_putc(m, '\n');
    1747             : }
    1748             : 
    1749             : /*
    1750             :  * Output information about zones in @pgdat.  All zones are printed regardless
    1751             :  * of whether they are populated or not: lowmem_reserve_ratio operates on the
    1752             :  * set of all zones and userspace would not be aware of such zones if they are
    1753             :  * suppressed here (zoneinfo displays the effect of lowmem_reserve_ratio).
    1754             :  */
    1755           0 : static int zoneinfo_show(struct seq_file *m, void *arg)
    1756             : {
    1757           0 :         pg_data_t *pgdat = (pg_data_t *)arg;
    1758           0 :         walk_zones_in_node(m, pgdat, false, false, zoneinfo_show_print);
    1759           0 :         return 0;
    1760             : }
    1761             : 
    1762             : static const struct seq_operations zoneinfo_op = {
     1763             :         .start  = frag_start, /* iterate over all zones; the same
     1764             :                                * iterator as fragmentation_op */
    1765             :         .next   = frag_next,
    1766             :         .stop   = frag_stop,
    1767             :         .show   = zoneinfo_show,
    1768             : };
    1769             : 
    1770             : #define NR_VMSTAT_ITEMS (NR_VM_ZONE_STAT_ITEMS + \
    1771             :                          NR_VM_NUMA_EVENT_ITEMS + \
    1772             :                          NR_VM_NODE_STAT_ITEMS + \
    1773             :                          NR_VM_WRITEBACK_STAT_ITEMS + \
    1774             :                          (IS_ENABLED(CONFIG_VM_EVENT_COUNTERS) ? \
    1775             :                           NR_VM_EVENT_ITEMS : 0))
    1776             : 
    1777           0 : static void *vmstat_start(struct seq_file *m, loff_t *pos)
    1778             : {
    1779             :         unsigned long *v;
    1780             :         int i;
    1781             : 
    1782           0 :         if (*pos >= NR_VMSTAT_ITEMS)
    1783             :                 return NULL;
    1784             : 
    1785             :         BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) < NR_VMSTAT_ITEMS);
    1786             :         fold_vm_numa_events();
    1787           0 :         v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL);
    1788           0 :         m->private = v;
    1789           0 :         if (!v)
    1790             :                 return ERR_PTR(-ENOMEM);
    1791           0 :         for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
    1792           0 :                 v[i] = global_zone_page_state(i);
    1793             :         v += NR_VM_ZONE_STAT_ITEMS;
    1794             : 
    1795             : #ifdef CONFIG_NUMA
    1796             :         for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
    1797             :                 v[i] = global_numa_event_state(i);
    1798             :         v += NR_VM_NUMA_EVENT_ITEMS;
    1799             : #endif
    1800             : 
    1801           0 :         for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
    1802           0 :                 v[i] = global_node_page_state_pages(i);
    1803           0 :                 if (vmstat_item_print_in_thp(i))
    1804             :                         v[i] /= HPAGE_PMD_NR;
    1805             :         }
    1806           0 :         v += NR_VM_NODE_STAT_ITEMS;
    1807             : 
    1808           0 :         global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
    1809             :                             v + NR_DIRTY_THRESHOLD);
    1810           0 :         v += NR_VM_WRITEBACK_STAT_ITEMS;
    1811             : 
    1812             : #ifdef CONFIG_VM_EVENT_COUNTERS
    1813           0 :         all_vm_events(v);
    1814           0 :         v[PGPGIN] /= 2;         /* sectors -> kbytes */
    1815           0 :         v[PGPGOUT] /= 2;
    1816             : #endif
    1817           0 :         return (unsigned long *)m->private + *pos;
    1818             : }
    1819             : 
    1820           0 : static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
    1821             : {
    1822           0 :         (*pos)++;
    1823           0 :         if (*pos >= NR_VMSTAT_ITEMS)
    1824             :                 return NULL;
    1825           0 :         return (unsigned long *)m->private + *pos;
    1826             : }
    1827             : 
    1828           0 : static int vmstat_show(struct seq_file *m, void *arg)
    1829             : {
    1830           0 :         unsigned long *l = arg;
    1831           0 :         unsigned long off = l - (unsigned long *)m->private;
    1832             : 
    1833           0 :         seq_puts(m, vmstat_text[off]);
    1834           0 :         seq_put_decimal_ull(m, " ", *l);
    1835           0 :         seq_putc(m, '\n');
    1836             : 
    1837           0 :         if (off == NR_VMSTAT_ITEMS - 1) {
    1838             :                 /*
    1839             :                  * We've come to the end - add any deprecated counters to avoid
    1840             :                  * breaking userspace which might depend on them being present.
    1841             :                  */
    1842           0 :                 seq_puts(m, "nr_unstable 0\n");
    1843             :         }
    1844           0 :         return 0;
    1845             : }
    1846             : 
    1847           0 : static void vmstat_stop(struct seq_file *m, void *arg)
    1848             : {
    1849           0 :         kfree(m->private);
    1850           0 :         m->private = NULL;
    1851           0 : }
    1852             : 
    1853             : static const struct seq_operations vmstat_op = {
    1854             :         .start  = vmstat_start,
    1855             :         .next   = vmstat_next,
    1856             :         .stop   = vmstat_stop,
    1857             :         .show   = vmstat_show,
    1858             : };
    1859             : #endif /* CONFIG_PROC_FS */
    1860             : 
    1861             : #ifdef CONFIG_SMP
    1862             : static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
    1863             : int sysctl_stat_interval __read_mostly = HZ;
    1864             : 
    1865             : #ifdef CONFIG_PROC_FS
    1866             : static void refresh_vm_stats(struct work_struct *work)
    1867             : {
    1868             :         refresh_cpu_vm_stats(true);
    1869             : }
    1870             : 
    1871             : int vmstat_refresh(struct ctl_table *table, int write,
    1872             :                    void *buffer, size_t *lenp, loff_t *ppos)
    1873             : {
    1874             :         long val;
    1875             :         int err;
    1876             :         int i;
    1877             : 
    1878             :         /*
    1879             :          * The regular update, every sysctl_stat_interval, may come later
     1880             :          * than expected, leaving a significant amount in the per-cpu buckets.
    1881             :          * This is particularly misleading when checking a quantity of HUGE
    1882             :          * pages, immediately after running a test.  /proc/sys/vm/stat_refresh,
    1883             :          * which can equally be echo'ed to or cat'ted from (by root),
    1884             :          * can be used to update the stats just before reading them.
    1885             :          *
    1886             :          * Oh, and since global_zone_page_state() etc. are so careful to hide
    1887             :          * transiently negative values, report an error here if any of
    1888             :          * the stats is negative, so we know to go looking for imbalance.
    1889             :          */
    1890             :         err = schedule_on_each_cpu(refresh_vm_stats);
    1891             :         if (err)
    1892             :                 return err;
    1893             :         for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
    1894             :                 /*
    1895             :                  * Skip checking stats known to go negative occasionally.
    1896             :                  */
    1897             :                 switch (i) {
    1898             :                 case NR_ZONE_WRITE_PENDING:
    1899             :                 case NR_FREE_CMA_PAGES:
    1900             :                         continue;
    1901             :                 }
    1902             :                 val = atomic_long_read(&vm_zone_stat[i]);
    1903             :                 if (val < 0) {
    1904             :                         pr_warn("%s: %s %ld\n",
    1905             :                                 __func__, zone_stat_name(i), val);
    1906             :                 }
    1907             :         }
    1908             :         for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
    1909             :                 /*
    1910             :                  * Skip checking stats known to go negative occasionally.
    1911             :                  */
    1912             :                 switch (i) {
    1913             :                 case NR_WRITEBACK:
    1914             :                         continue;
    1915             :                 }
    1916             :                 val = atomic_long_read(&vm_node_stat[i]);
    1917             :                 if (val < 0) {
    1918             :                         pr_warn("%s: %s %ld\n",
    1919             :                                 __func__, node_stat_name(i), val);
    1920             :                 }
    1921             :         }
    1922             :         if (write)
    1923             :                 *ppos += *lenp;
    1924             :         else
    1925             :                 *lenp = 0;
    1926             :         return 0;
    1927             : }
    1928             : #endif /* CONFIG_PROC_FS */
    1929             : 
    1930             : static void vmstat_update(struct work_struct *w)
    1931             : {
    1932             :         if (refresh_cpu_vm_stats(true)) {
    1933             :                 /*
    1934             :                  * Counters were updated so we expect more updates
    1935             :                  * to occur in the future. Keep on running the
    1936             :                  * update worker thread.
    1937             :                  */
    1938             :                 queue_delayed_work_on(smp_processor_id(), mm_percpu_wq,
    1939             :                                 this_cpu_ptr(&vmstat_work),
    1940             :                                 round_jiffies_relative(sysctl_stat_interval));
    1941             :         }
    1942             : }
    1943             : 
    1944             : /*
    1945             :  * Check if the diffs for a certain cpu indicate that
    1946             :  * an update is needed.
    1947             :  */
    1948             : static bool need_update(int cpu)
    1949             : {
    1950             :         pg_data_t *last_pgdat = NULL;
    1951             :         struct zone *zone;
    1952             : 
    1953             :         for_each_populated_zone(zone) {
    1954             :                 struct per_cpu_zonestat *pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
    1955             :                 struct per_cpu_nodestat *n;
    1956             : 
    1957             :                 /*
    1958             :                  * The fast way of checking if there are any vmstat diffs.
    1959             :                  */
    1960             :                 if (memchr_inv(pzstats->vm_stat_diff, 0, sizeof(pzstats->vm_stat_diff)))
    1961             :                         return true;
    1962             : 
    1963             :                 if (last_pgdat == zone->zone_pgdat)
    1964             :                         continue;
    1965             :                 last_pgdat = zone->zone_pgdat;
    1966             :                 n = per_cpu_ptr(zone->zone_pgdat->per_cpu_nodestats, cpu);
    1967             :                 if (memchr_inv(n->vm_node_stat_diff, 0, sizeof(n->vm_node_stat_diff)))
    1968             :                         return true;
    1969             :         }
    1970             :         return false;
    1971             : }
    1972             : 
    1973             : /*
    1974             :  * Switch off vmstat processing and then fold all the remaining differentials
    1975             :  * until the diffs stay at zero. The function is used by NOHZ and can only be
    1976             :  * invoked when tick processing is not active.
    1977             :  */
    1978             : void quiet_vmstat(void)
    1979             : {
    1980             :         if (system_state != SYSTEM_RUNNING)
    1981             :                 return;
    1982             : 
    1983             :         if (!delayed_work_pending(this_cpu_ptr(&vmstat_work)))
    1984             :                 return;
    1985             : 
    1986             :         if (!need_update(smp_processor_id()))
    1987             :                 return;
    1988             : 
    1989             :         /*
    1990             :          * Just refresh counters and do not care about the pending delayed
    1991             :          * vmstat_update. It doesn't fire that often to matter and canceling
    1992             :          * it would be too expensive from this path.
    1993             :          * vmstat_shepherd will take care about that for us.
    1994             :          */
    1995             :         refresh_cpu_vm_stats(false);
    1996             : }
    1997             : 
    1998             : /*
     1999             :  * Shepherd worker that checks the differentials of
     2000             :  * processors whose per-cpu vmstat worker threads have
     2001             :  * been disabled because of inactivity, and requeues
     2002             :  * them when updates are pending.
    2003             :  */
    2004             : static void vmstat_shepherd(struct work_struct *w);
    2005             : 
    2006             : static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd);
    2007             : 
    2008             : static void vmstat_shepherd(struct work_struct *w)
    2009             : {
    2010             :         int cpu;
    2011             : 
    2012             :         cpus_read_lock();
    2013             :         /* Check processors whose vmstat worker threads have been disabled */
    2014             :         for_each_online_cpu(cpu) {
    2015             :                 struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
    2016             : 
    2017             :                 if (!delayed_work_pending(dw) && need_update(cpu))
    2018             :                         queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
    2019             : 
    2020             :                 cond_resched();
    2021             :         }
    2022             :         cpus_read_unlock();
    2023             : 
    2024             :         schedule_delayed_work(&shepherd,
    2025             :                 round_jiffies_relative(sysctl_stat_interval));
    2026             : }
    2027             : 
    2028             : static void __init start_shepherd_timer(void)
    2029             : {
    2030             :         int cpu;
    2031             : 
    2032             :         for_each_possible_cpu(cpu)
    2033             :                 INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
    2034             :                         vmstat_update);
    2035             : 
    2036             :         schedule_delayed_work(&shepherd,
    2037             :                 round_jiffies_relative(sysctl_stat_interval));
    2038             : }
    2039             : 
    2040             : static void __init init_cpu_node_state(void)
    2041             : {
    2042             :         int node;
    2043             : 
    2044             :         for_each_online_node(node) {
    2045             :                 if (cpumask_weight(cpumask_of_node(node)) > 0)
    2046             :                         node_set_state(node, N_CPU);
    2047             :         }
    2048             : }
    2049             : 
    2050             : static int vmstat_cpu_online(unsigned int cpu)
    2051             : {
    2052             :         refresh_zone_stat_thresholds();
    2053             : 
    2054             :         if (!node_state(cpu_to_node(cpu), N_CPU)) {
    2055             :                 node_set_state(cpu_to_node(cpu), N_CPU);
    2056             :                 set_migration_target_nodes();
    2057             :         }
    2058             : 
    2059             :         return 0;
    2060             : }
    2061             : 
    2062             : static int vmstat_cpu_down_prep(unsigned int cpu)
    2063             : {
    2064             :         cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
    2065             :         return 0;
    2066             : }
    2067             : 
    2068             : static int vmstat_cpu_dead(unsigned int cpu)
    2069             : {
    2070             :         const struct cpumask *node_cpus;
    2071             :         int node;
    2072             : 
    2073             :         node = cpu_to_node(cpu);
    2074             : 
    2075             :         refresh_zone_stat_thresholds();
    2076             :         node_cpus = cpumask_of_node(node);
    2077             :         if (cpumask_weight(node_cpus) > 0)
    2078             :                 return 0;
    2079             : 
    2080             :         node_clear_state(node, N_CPU);
    2081             :         set_migration_target_nodes();
    2082             : 
    2083             :         return 0;
    2084             : }
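
vmstat_cpu_online(), vmstat_cpu_down_prep() and vmstat_cpu_dead() above are
hooked into the CPU hotplug state machine by init_mm_internals() below. A
minimal sketch of registering a comparable online/offline pair on a
dynamically allocated hotplug state from a module (all names hypothetical):

#include <linux/module.h>
#include <linux/cpuhotplug.h>

static enum cpuhp_state demo_state;

static int demo_cpu_online(unsigned int cpu)
{
	pr_info("cpu %u coming online\n", cpu);
	return 0;	/* a negative return would abort the bring-up */
}

static int demo_cpu_offline(unsigned int cpu)
{
	pr_info("cpu %u going offline\n", cpu);
	return 0;
}

static int __init demo_init(void)
{
	int ret;

	/*
	 * CPUHP_AP_ONLINE_DYN allocates a dynamic state slot; unlike the
	 * _nocalls() variant used by vmstat, this also invokes the online
	 * callback for every CPU that is already up.
	 */
	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "demo:online",
				demo_cpu_online, demo_cpu_offline);
	if (ret < 0)
		return ret;
	demo_state = ret;
	return 0;
}

static void __exit demo_exit(void)
{
	cpuhp_remove_state(demo_state);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");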
    2085             : 
    2086             : #endif
    2087             : 
    2088             : struct workqueue_struct *mm_percpu_wq;
    2089             : 
    2090           1 : void __init init_mm_internals(void)
    2091             : {
    2092             :         int ret __maybe_unused;
    2093             : 
    2094           1 :         mm_percpu_wq = alloc_workqueue("mm_percpu_wq", WQ_MEM_RECLAIM, 0);
    2095             : 
    2096             : #ifdef CONFIG_SMP
    2097             :         ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead",
    2098             :                                         NULL, vmstat_cpu_dead);
    2099             :         if (ret < 0)
    2100             :                 pr_err("vmstat: failed to register 'dead' hotplug state\n");
    2101             : 
    2102             :         ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/vmstat:online",
    2103             :                                         vmstat_cpu_online,
    2104             :                                         vmstat_cpu_down_prep);
    2105             :         if (ret < 0)
    2106             :                 pr_err("vmstat: failed to register 'online' hotplug state\n");
    2107             : 
    2108             :         cpus_read_lock();
    2109             :         init_cpu_node_state();
    2110             :         cpus_read_unlock();
    2111             : 
    2112             :         start_shepherd_timer();
    2113             : #endif
    2114             : #if defined(CONFIG_MIGRATION) && defined(CONFIG_HOTPLUG_CPU)
    2115             :         migrate_on_reclaim_init();
    2116             : #endif
    2117             : #ifdef CONFIG_PROC_FS
    2118           1 :         proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
    2119           1 :         proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
    2120           1 :         proc_create_seq("vmstat", 0444, NULL, &vmstat_op);
    2121           1 :         proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op);
    2122             : #endif
    2123           1 : }
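
proc_create_seq() registers a /proc file backed directly by a struct
seq_operations, the same mechanism used for the four files above. A minimal
sketch that iterates over a fixed array (file name and contents are
hypothetical):

#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static const char *const demo_items[] = { "alpha", "beta", "gamma" };

static void *demo_start(struct seq_file *m, loff_t *pos)
{
	if (*pos >= ARRAY_SIZE(demo_items))
		return NULL;
	return (void *)&demo_items[*pos];
}

static void *demo_next(struct seq_file *m, void *v, loff_t *pos)
{
	(*pos)++;
	return demo_start(m, pos);
}

static void demo_stop(struct seq_file *m, void *v)
{
}

static int demo_show(struct seq_file *m, void *v)
{
	seq_printf(m, "%s\n", *(const char *const *)v);
	return 0;
}

static const struct seq_operations demo_op = {
	.start	= demo_start,
	.next	= demo_next,
	.stop	= demo_stop,
	.show	= demo_show,
};

static int __init demo_init(void)
{
	proc_create_seq("demo_items", 0444, NULL, &demo_op);
	return 0;
}

static void __exit demo_exit(void)
{
	remove_proc_entry("demo_items", NULL);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");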
    2124             : 
    2125             : #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
    2126             : 
    2127             : /*
    2128             :  * Return an index indicating how much of the available free memory is
    2129             :  * unusable for an allocation of the requested size.
    2130             :  */
    2131             : static int unusable_free_index(unsigned int order,
    2132             :                                 struct contig_page_info *info)
    2133             : {
    2134             :         /* No free memory is interpreted as all free memory is unusable */
    2135             :         if (info->free_pages == 0)
    2136             :                 return 1000;
    2137             : 
    2138             :         /*
    2139             :          * The index is conceptually a value between 0 and 1, returned
    2140             :          * here scaled by 1000 to preserve three decimal places.
    2141             :          *
    2142             :          * 0 => no fragmentation
    2143             :          * 1 => high fragmentation
    2144             :          */
    2145             :         return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages);
    2146             : 
    2147             : }
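
A worked example with hypothetical numbers: an order-2 request (4 pages)
against a zone holding 1000 free pages, of which 200 free blocks are order 2
or larger:

/*
 * order = 2, free_pages = 1000, free_blocks_suitable = 200,
 * so 200 << 2 = 800 pages sit in blocks big enough for the request.
 *
 *   index = (1000 - 800) * 1000 / 1000 = 200
 *
 * Reported as 0.200: 20% of the free memory is unusable for
 * order-2 allocations.
 */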
    2148             : 
    2149             : static void unusable_show_print(struct seq_file *m,
    2150             :                                         pg_data_t *pgdat, struct zone *zone)
    2151             : {
    2152             :         unsigned int order;
    2153             :         int index;
    2154             :         struct contig_page_info info;
    2155             : 
    2156             :         seq_printf(m, "Node %d, zone %8s ",
    2157             :                                 pgdat->node_id,
    2158             :                                 zone->name);
    2159             :         for (order = 0; order < MAX_ORDER; ++order) {
    2160             :                 fill_contig_page_info(zone, order, &info);
    2161             :                 index = unusable_free_index(order, &info);
    2162             :                 seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
    2163             :         }
    2164             : 
    2165             :         seq_putc(m, '\n');
    2166             : }
    2167             : 
    2168             : /*
    2169             :  * Display unusable free space index
    2170             :  *
    2171             :  * The unusable free space index measures how much of the available free
    2172             :  * memory cannot be used to satisfy an allocation of a given size and is a
    2173             :  * value between 0 and 1. The higher the value, the more of the free memory
    2174             :  * is unusable and, by implication, the worse the external fragmentation is.
    2175             :  * This can be expressed as a percentage by multiplying by 100.
    2176             :  */
    2177             : static int unusable_show(struct seq_file *m, void *arg)
    2178             : {
    2179             :         pg_data_t *pgdat = (pg_data_t *)arg;
    2180             : 
    2181             :         /* check memoryless node */
    2182             :         if (!node_state(pgdat->node_id, N_MEMORY))
    2183             :                 return 0;
    2184             : 
    2185             :         walk_zones_in_node(m, pgdat, true, false, unusable_show_print);
    2186             : 
    2187             :         return 0;
    2188             : }
    2189             : 
    2190             : static const struct seq_operations unusable_sops = {
    2191             :         .start  = frag_start,
    2192             :         .next   = frag_next,
    2193             :         .stop   = frag_stop,
    2194             :         .show   = unusable_show,
    2195             : };
    2196             : 
    2197             : DEFINE_SEQ_ATTRIBUTE(unusable);
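
DEFINE_SEQ_ATTRIBUTE(unusable), from <linux/seq_file.h>, generates
unusable_open() and unusable_fops from unusable_sops. Its expansion is
roughly equivalent to the following (the real macro additionally copies
inode->i_private into the seq_file's private field):

static int unusable_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &unusable_sops);
}

static const struct file_operations unusable_fops = {
	.owner		= THIS_MODULE,
	.open		= unusable_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};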
    2198             : 
    2199             : static void extfrag_show_print(struct seq_file *m,
    2200             :                                         pg_data_t *pgdat, struct zone *zone)
    2201             : {
    2202             :         unsigned int order;
    2203             :         int index;
    2204             : 
    2205             :         /* Alloc on stack as interrupts are disabled for zone walk */
    2206             :         struct contig_page_info info;
    2207             : 
    2208             :         seq_printf(m, "Node %d, zone %8s ",
    2209             :                                 pgdat->node_id,
    2210             :                                 zone->name);
    2211             :         for (order = 0; order < MAX_ORDER; ++order) {
    2212             :                 fill_contig_page_info(zone, order, &info);
    2213             :                 index = __fragmentation_index(order, &info);
    2214             :                 seq_printf(m, "%2d.%03d ", index / 1000, index % 1000);
    2215             :         }
    2216             : 
    2217             :         seq_putc(m, '\n');
    2218             : }
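
For reference, __fragmentation_index() (defined earlier in this file) only
computes an index for a request that would fail; when a suitably sized free
block exists it returns -1000, printed as -1.000. A worked example with
hypothetical numbers:

/*
 * An order-3 request (8 pages) against 1000 free pages spread over
 * 500 free blocks, none of order >= 3:
 *
 *   index = 1000 - (1000 + 1000 * 1000 / 8) / 500
 *         = 1000 - 126000 / 500
 *         = 748
 *
 * Reported as 0.748: the failure is mostly due to external
 * fragmentation; values near 0 would instead indicate a plain
 * shortage of memory.
 */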
    2219             : 
    2220             : /*
    2221             :  * Display the fragmentation index for each order at which an allocation would fail
    2222             :  */
    2223             : static int extfrag_show(struct seq_file *m, void *arg)
    2224             : {
    2225             :         pg_data_t *pgdat = (pg_data_t *)arg;
    2226             : 
    2227             :         walk_zones_in_node(m, pgdat, true, false, extfrag_show_print);
    2228             : 
    2229             :         return 0;
    2230             : }
    2231             : 
    2232             : static const struct seq_operations extfrag_sops = {
    2233             :         .start  = frag_start,
    2234             :         .next   = frag_next,
    2235             :         .stop   = frag_stop,
    2236             :         .show   = extfrag_show,
    2237             : };
    2238             : 
    2239             : DEFINE_SEQ_ATTRIBUTE(extfrag);
    2240             : 
    2241             : static int __init extfrag_debug_init(void)
    2242             : {
    2243             :         struct dentry *extfrag_debug_root;
    2244             : 
    2245             :         extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
    2246             : 
    2247             :         debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL,
    2248             :                             &unusable_fops);
    2249             : 
    2250             :         debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL,
    2251             :                             &extfrag_fops);
    2252             : 
    2253             :         return 0;
    2254             : }
    2255             : 
    2256             : module_init(extfrag_debug_init);
    2257             : #endif
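
With debugfs mounted at its usual /sys/kernel/debug location, the two files
created above appear as /sys/kernel/debug/extfrag/unusable_index and
/sys/kernel/debug/extfrag/extfrag_index, each printing one line per zone in
the per-order format produced by the *_show_print() helpers above.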

Generated by: LCOV version 1.14