LCOV - code coverage report
Current view: top level - mm - oom_kill.c (source / functions)
Test: coverage.info
Date: 2022-12-09 01:23:36
Lines: 13 of 333 hit (3.9 %) | Functions: 3 of 30 hit (10.0 %)

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-only
       2             : /*
       3             :  *  linux/mm/oom_kill.c
       4             :  * 
       5             :  *  Copyright (C)  1998,2000  Rik van Riel
       6             :  *      Thanks go out to Claus Fischer for some serious inspiration and
       7             :  *      for goading me into coding this file...
       8             :  *  Copyright (C)  2010  Google, Inc.
       9             :  *      Rewritten by David Rientjes
      10             :  *
      11             :  *  The routines in this file are used to kill a process when
      12             :  *  we're seriously out of memory. This gets called from __alloc_pages()
      13             :  *  in mm/page_alloc.c when we really run out of memory.
      14             :  *
      15             :  *  Since we won't call these routines often (on a well-configured
      16             :  *  machine) this file will double as a 'coding guide' and a signpost
      17             :  *  for newbie kernel hackers. It features several pointers to major
      18             :  *  kernel subsystems and hints as to where to find out what things do.
      19             :  */
      20             : 
      21             : #include <linux/oom.h>
      22             : #include <linux/mm.h>
      23             : #include <linux/err.h>
      24             : #include <linux/gfp.h>
      25             : #include <linux/sched.h>
      26             : #include <linux/sched/mm.h>
      27             : #include <linux/sched/coredump.h>
      28             : #include <linux/sched/task.h>
      29             : #include <linux/sched/debug.h>
      30             : #include <linux/swap.h>
      31             : #include <linux/syscalls.h>
      32             : #include <linux/timex.h>
      33             : #include <linux/jiffies.h>
      34             : #include <linux/cpuset.h>
      35             : #include <linux/export.h>
      36             : #include <linux/notifier.h>
      37             : #include <linux/memcontrol.h>
      38             : #include <linux/mempolicy.h>
      39             : #include <linux/security.h>
      40             : #include <linux/ptrace.h>
      41             : #include <linux/freezer.h>
      42             : #include <linux/ftrace.h>
      43             : #include <linux/ratelimit.h>
      44             : #include <linux/kthread.h>
      45             : #include <linux/init.h>
      46             : #include <linux/mmu_notifier.h>
      47             : 
      48             : #include <asm/tlb.h>
      49             : #include "internal.h"
      50             : #include "slab.h"
      51             : 
      52             : #define CREATE_TRACE_POINTS
      53             : #include <trace/events/oom.h>
      54             : 
      55             : int sysctl_panic_on_oom;
      56             : int sysctl_oom_kill_allocating_task;
      57             : int sysctl_oom_dump_tasks = 1;
      58             : 
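
    These three knobs are exposed under /proc/sys/vm/ as panic_on_oom,
    oom_kill_allocating_task and oom_dump_tasks. A minimal user-space sketch of
    reading and setting one of them (illustrative only; writing needs root,
    error handling mostly elided):

        #include <stdio.h>

        int main(void)
        {
                FILE *f = fopen("/proc/sys/vm/oom_dump_tasks", "r+");
                int val = -1;

                if (!f)
                        return 1;
                if (fscanf(f, "%d", &val) == 1)  /* current setting; 1 by default */
                        printf("oom_dump_tasks = %d\n", val);
                rewind(f);                       /* reposition before writing */
                fprintf(f, "1\n");               /* keep the task dump enabled */
                fclose(f);
                return 0;
        }
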
      59             : /*
      60             :  * Serializes oom killer invocations (out_of_memory()) from all contexts to
       61             :  * prevent over-eager oom killing (e.g. when the oom killer is invoked
      62             :  * from different domains).
      63             :  *
      64             :  * oom_killer_disable() relies on this lock to stabilize oom_killer_disabled
      65             :  * and mark_oom_victim
      66             :  */
      67             : DEFINE_MUTEX(oom_lock);
      68             : /* Serializes oom_score_adj and oom_score_adj_min updates */
      69             : DEFINE_MUTEX(oom_adj_mutex);
      70             : 
      71             : static inline bool is_memcg_oom(struct oom_control *oc)
      72             : {
      73             :         return oc->memcg != NULL;
      74             : }
      75             : 
      76             : #ifdef CONFIG_NUMA
      77             : /**
      78             :  * oom_cpuset_eligible() - check task eligibility for kill
       79             :  * @start: task struct of the task to consider
      80             :  * @oc: pointer to struct oom_control
      81             :  *
      82             :  * Task eligibility is determined by whether or not a candidate task, @tsk,
      83             :  * shares the same mempolicy nodes as current if it is bound by such a policy
      84             :  * and whether or not it has the same set of allowed cpuset nodes.
      85             :  *
       86             :  * This function assumes oom-killer context, i.e. 'current' has triggered
       87             :  * the oom-killer.
      88             :  */
      89             : static bool oom_cpuset_eligible(struct task_struct *start,
      90             :                                 struct oom_control *oc)
      91             : {
      92             :         struct task_struct *tsk;
      93             :         bool ret = false;
      94             :         const nodemask_t *mask = oc->nodemask;
      95             : 
      96             :         rcu_read_lock();
      97             :         for_each_thread(start, tsk) {
      98             :                 if (mask) {
      99             :                         /*
     100             :                          * If this is a mempolicy constrained oom, tsk's
     101             :                          * cpuset is irrelevant.  Only return true if its
     102             :                          * mempolicy intersects current, otherwise it may be
     103             :                          * needlessly killed.
     104             :                          */
     105             :                         ret = mempolicy_in_oom_domain(tsk, mask);
     106             :                 } else {
     107             :                         /*
     108             :                          * This is not a mempolicy constrained oom, so only
     109             :                          * check the mems of tsk's cpuset.
     110             :                          */
     111             :                         ret = cpuset_mems_allowed_intersects(current, tsk);
     112             :                 }
     113             :                 if (ret)
     114             :                         break;
     115             :         }
     116             :         rcu_read_unlock();
     117             : 
     118             :         return ret;
     119             : }
     120             : #else
     121             : static bool oom_cpuset_eligible(struct task_struct *tsk, struct oom_control *oc)
     122             : {
     123             :         return true;
     124             : }
     125             : #endif /* CONFIG_NUMA */
     126             : 
     127             : /*
     128             :  * The process p may have detached its own ->mm while exiting or through
     129             :  * kthread_use_mm(), but one or more of its subthreads may still have a valid
     130             :  * pointer.  Return p, or any of its subthreads with a valid ->mm, with
     131             :  * task_lock() held.
     132             :  */
     133          14 : struct task_struct *find_lock_task_mm(struct task_struct *p)
     134             : {
     135             :         struct task_struct *t;
     136             : 
     137             :         rcu_read_lock();
     138             : 
     139          28 :         for_each_thread(p, t) {
     140          14 :                 task_lock(t);
     141          14 :                 if (likely(t->mm))
     142             :                         goto found;
     143          14 :                 task_unlock(t);
     144             :         }
     145             :         t = NULL;
     146             : found:
     147             :         rcu_read_unlock();
     148             : 
     149          14 :         return t;
     150             : }
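
    The contract worth noting here: on success the returned thread is
    task_lock()ed, so its ->mm may be dereferenced safely until task_unlock().
    A hedged sketch of a caller (hypothetical helper, not part of this file):

        /* Hypothetical: snapshot the RSS of p's mm, if any thread still has one. */
        static unsigned long rss_of(struct task_struct *p)
        {
                struct task_struct *t = find_lock_task_mm(p);
                unsigned long rss = 0;

                if (t) {
                        rss = get_mm_rss(t->mm); /* ->mm stable: t is task_lock()ed */
                        task_unlock(t);          /* pair with the lock taken above */
                }
                return rss;
        }
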
     151             : 
     152             : /*
      153             :  * order == -1 means the oom kill is required by sysrq; any other value
      154             :  * is used only for display purposes.
     155             :  */
     156             : static inline bool is_sysrq_oom(struct oom_control *oc)
     157             : {
     158             :         return oc->order == -1;
     159             : }
     160             : 
      161             : /* Return true if the task is not suitable as a candidate victim task. */
     162             : static bool oom_unkillable_task(struct task_struct *p)
     163             : {
     164           0 :         if (is_global_init(p))
     165             :                 return true;
     166           0 :         if (p->flags & PF_KTHREAD)
     167             :                 return true;
     168             :         return false;
     169             : }
     170             : 
     171             : /*
      172             :  * Check whether the amount of unreclaimable slab is greater than
      173             :  * all user memory (LRU pages).
      174             :  * dump_unreclaimable_slab() could help in the case that the oom is
      175             :  * due to too much unreclaimable slab used by the kernel.
      176             :  */
     177             : static bool should_dump_unreclaim_slab(void)
     178             : {
     179             :         unsigned long nr_lru;
     180             : 
     181           0 :         nr_lru = global_node_page_state(NR_ACTIVE_ANON) +
     182           0 :                  global_node_page_state(NR_INACTIVE_ANON) +
     183           0 :                  global_node_page_state(NR_ACTIVE_FILE) +
     184           0 :                  global_node_page_state(NR_INACTIVE_FILE) +
     185           0 :                  global_node_page_state(NR_ISOLATED_ANON) +
     186           0 :                  global_node_page_state(NR_ISOLATED_FILE) +
     187           0 :                  global_node_page_state(NR_UNEVICTABLE);
     188             : 
     189           0 :         return (global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B) > nr_lru);
     190             : }
     191             : 
     192             : /**
     193             :  * oom_badness - heuristic function to determine which candidate task to kill
      194             :  * @p: task struct of the task whose badness score we should calculate
     195             :  * @totalpages: total present RAM allowed for page allocation
     196             :  *
     197             :  * The heuristic for determining which task to kill is made to be as simple and
     198             :  * predictable as possible.  The goal is to return the highest value for the
     199             :  * task consuming the most memory to avoid subsequent oom failures.
     200             :  */
     201           0 : long oom_badness(struct task_struct *p, unsigned long totalpages)
     202             : {
     203             :         long points;
     204             :         long adj;
     205             : 
     206           0 :         if (oom_unkillable_task(p))
     207             :                 return LONG_MIN;
     208             : 
     209           0 :         p = find_lock_task_mm(p);
     210           0 :         if (!p)
     211             :                 return LONG_MIN;
     212             : 
     213             :         /*
     214             :          * Do not even consider tasks which are explicitly marked oom
      215             :          * unkillable, have already been oom reaped, or are in
      216             :          * the middle of vfork.
     217             :          */
     218           0 :         adj = (long)p->signal->oom_score_adj;
     219           0 :         if (adj == OOM_SCORE_ADJ_MIN ||
     220           0 :                         test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
     221           0 :                         in_vfork(p)) {
     222             :                 task_unlock(p);
     223           0 :                 return LONG_MIN;
     224             :         }
     225             : 
     226             :         /*
     227             :          * The baseline for the badness score is the proportion of RAM that each
     228             :          * task's rss, pagetable and swap space use.
     229             :          */
     230           0 :         points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
     231           0 :                 mm_pgtables_bytes(p->mm) / PAGE_SIZE;
     232             :         task_unlock(p);
     233             : 
     234             :         /* Normalize to oom_score_adj units */
     235           0 :         adj *= totalpages / 1000;
     236           0 :         points += adj;
     237             : 
     238           0 :         return points;
     239             : }
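
    A user-space sketch of the same arithmetic with illustrative numbers (all
    values hypothetical; like the kernel, it works in units of pages):

        #include <stdio.h>

        int main(void)
        {
                long rss        = 262144;  /* resident pages (~1 GiB at 4 KiB/page) */
                long swapents   = 4096;    /* swap entries */
                long pgtables_b = 2097152; /* page-table bytes */
                long page_size  = 4096;
                long totalpages = 4194304; /* RAM + swap in pages (~16 GiB) */
                long adj        = 300;     /* oom_score_adj */

                long points = rss + swapents + pgtables_b / page_size;
                points += adj * (totalpages / 1000); /* normalize adj to page units */
                printf("badness = %ld\n", points);   /* higher is killed first */
                return 0;
        }
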
     240             : 
     241             : static const char * const oom_constraint_text[] = {
     242             :         [CONSTRAINT_NONE] = "CONSTRAINT_NONE",
     243             :         [CONSTRAINT_CPUSET] = "CONSTRAINT_CPUSET",
     244             :         [CONSTRAINT_MEMORY_POLICY] = "CONSTRAINT_MEMORY_POLICY",
     245             :         [CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG",
     246             : };
     247             : 
     248             : /*
     249             :  * Determine the type of allocation constraint.
     250             :  */
     251             : static enum oom_constraint constrained_alloc(struct oom_control *oc)
     252             : {
     253             :         struct zone *zone;
     254             :         struct zoneref *z;
     255           0 :         enum zone_type highest_zoneidx = gfp_zone(oc->gfp_mask);
     256           0 :         bool cpuset_limited = false;
     257             :         int nid;
     258             : 
     259           0 :         if (is_memcg_oom(oc)) {
     260           0 :                 oc->totalpages = mem_cgroup_get_max(oc->memcg) ?: 1;
     261             :                 return CONSTRAINT_MEMCG;
     262             :         }
     263             : 
     264             :         /* Default to all available memory */
     265           0 :         oc->totalpages = totalram_pages() + total_swap_pages;
     266             : 
     267             :         if (!IS_ENABLED(CONFIG_NUMA))
     268             :                 return CONSTRAINT_NONE;
     269             : 
     270             :         if (!oc->zonelist)
     271             :                 return CONSTRAINT_NONE;
     272             :         /*
      273             :          * We reach here only when __GFP_NOFAIL is used, so we should avoid
      274             :          * killing current. We have to kill a random task in this case.
      275             :          * Hopefully, CONSTRAINT_THISNODE... but there is no way to handle it now.
     276             :          */
     277             :         if (oc->gfp_mask & __GFP_THISNODE)
     278             :                 return CONSTRAINT_NONE;
     279             : 
     280             :         /*
     281             :          * This is not a __GFP_THISNODE allocation, so a truncated nodemask in
     282             :          * the page allocator means a mempolicy is in effect.  Cpuset policy
     283             :          * is enforced in get_page_from_freelist().
     284             :          */
     285             :         if (oc->nodemask &&
     286             :             !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
     287             :                 oc->totalpages = total_swap_pages;
     288             :                 for_each_node_mask(nid, *oc->nodemask)
     289             :                         oc->totalpages += node_present_pages(nid);
     290             :                 return CONSTRAINT_MEMORY_POLICY;
     291             :         }
     292             : 
     293             :         /* Check this allocation failure is caused by cpuset's wall function */
     294             :         for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
     295             :                         highest_zoneidx, oc->nodemask)
     296             :                 if (!cpuset_zone_allowed(zone, oc->gfp_mask))
     297             :                         cpuset_limited = true;
     298             : 
     299             :         if (cpuset_limited) {
     300             :                 oc->totalpages = total_swap_pages;
     301             :                 for_each_node_mask(nid, cpuset_current_mems_allowed)
     302             :                         oc->totalpages += node_present_pages(nid);
     303             :                 return CONSTRAINT_CPUSET;
     304             :         }
     305             :         return CONSTRAINT_NONE;
     306             : }
     307             : 
     308           0 : static int oom_evaluate_task(struct task_struct *task, void *arg)
     309             : {
     310           0 :         struct oom_control *oc = arg;
     311             :         long points;
     312             : 
     313           0 :         if (oom_unkillable_task(task))
     314             :                 goto next;
     315             : 
     316             :         /* p may not have freeable memory in nodemask */
     317           0 :         if (!is_memcg_oom(oc) && !oom_cpuset_eligible(task, oc))
     318             :                 goto next;
     319             : 
     320             :         /*
     321             :          * This task already has access to memory reserves and is being killed.
     322             :          * Don't allow any other task to have access to the reserves unless
     323             :          * the task has MMF_OOM_SKIP because chances that it would release
     324             :          * any memory is quite low.
     325             :          */
     326           0 :         if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
     327           0 :                 if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
     328             :                         goto next;
     329             :                 goto abort;
     330             :         }
     331             : 
     332             :         /*
     333             :          * If task is allocating a lot of memory and has been marked to be
     334             :          * killed first if it triggers an oom, then select it.
     335             :          */
     336           0 :         if (oom_task_origin(task)) {
     337             :                 points = LONG_MAX;
     338             :                 goto select;
     339             :         }
     340             : 
     341           0 :         points = oom_badness(task, oc->totalpages);
     342           0 :         if (points == LONG_MIN || points < oc->chosen_points)
     343             :                 goto next;
     344             : 
     345             : select:
     346           0 :         if (oc->chosen)
     347           0 :                 put_task_struct(oc->chosen);
     348           0 :         get_task_struct(task);
     349           0 :         oc->chosen = task;
     350           0 :         oc->chosen_points = points;
     351             : next:
     352             :         return 0;
     353             : abort:
     354           0 :         if (oc->chosen)
     355           0 :                 put_task_struct(oc->chosen);
     356           0 :         oc->chosen = (void *)-1UL;
     357           0 :         return 1;
     358             : }
     359             : 
     360             : /*
     361             :  * Simple selection loop. We choose the process with the highest number of
      362             :  * 'points'. If the scan was aborted, oc->chosen is set to -1.
     363             :  */
     364           0 : static void select_bad_process(struct oom_control *oc)
     365             : {
     366           0 :         oc->chosen_points = LONG_MIN;
     367             : 
     368           0 :         if (is_memcg_oom(oc))
     369             :                 mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
     370             :         else {
     371             :                 struct task_struct *p;
     372             : 
     373             :                 rcu_read_lock();
     374           0 :                 for_each_process(p)
     375           0 :                         if (oom_evaluate_task(p, oc))
     376             :                                 break;
     377             :                 rcu_read_unlock();
     378             :         }
     379           0 : }
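
    oom_evaluate_task() follows the scan-callback convention that
    mem_cgroup_scan_tasks() also uses: return 0 to keep iterating, nonzero to
    abort the scan. A minimal user-space sketch of that pattern (names
    hypothetical):

        #include <stdio.h>

        /* Visit items until the callback asks to abort, as the task scans do. */
        static int scan(int *items, int n, int (*fn)(int item, void *arg), void *arg)
        {
                for (int i = 0; i < n; i++)
                        if (fn(items[i], arg)) /* nonzero aborts the scan */
                                return 1;
                return 0;
        }

        static int pick_max(int item, void *arg)
        {
                int *best = arg;

                if (item > *best)
                        *best = item;
                return 0;                      /* 0 keeps iterating */
        }

        int main(void)
        {
                int vals[] = { 3, 9, 4 }, best = -1;

                scan(vals, 3, pick_max, &best);
                printf("chosen: %d\n", best);  /* prints 9 */
                return 0;
        }
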
     380             : 
     381           0 : static int dump_task(struct task_struct *p, void *arg)
     382             : {
     383           0 :         struct oom_control *oc = arg;
     384             :         struct task_struct *task;
     385             : 
     386           0 :         if (oom_unkillable_task(p))
     387             :                 return 0;
     388             : 
     389             :         /* p may not have freeable memory in nodemask */
     390           0 :         if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
     391             :                 return 0;
     392             : 
     393           0 :         task = find_lock_task_mm(p);
     394           0 :         if (!task) {
     395             :                 /*
     396             :                  * All of p's threads have already detached their mm's. There's
     397             :                  * no need to report them; they can't be oom killed anyway.
     398             :                  */
     399             :                 return 0;
     400             :         }
     401             : 
     402           0 :         pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
     403             :                 task->pid, from_kuid(&init_user_ns, task_uid(task)),
     404             :                 task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
     405             :                 mm_pgtables_bytes(task->mm),
     406             :                 get_mm_counter(task->mm, MM_SWAPENTS),
     407             :                 task->signal->oom_score_adj, task->comm);
     408             :         task_unlock(task);
     409             : 
     410           0 :         return 0;
     411             : }
     412             : 
     413             : /**
     414             :  * dump_tasks - dump current memory state of all system tasks
     415             :  * @oc: pointer to struct oom_control
     416             :  *
     417             :  * Dumps the current memory state of all eligible tasks.  Tasks not in the same
     418             :  * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
     419             :  * are not shown.
     420             :  * State information includes task's pid, uid, tgid, vm size, rss,
     421             :  * pgtables_bytes, swapents, oom_score_adj value, and name.
     422             :  */
     423           0 : static void dump_tasks(struct oom_control *oc)
     424             : {
     425           0 :         pr_info("Tasks state (memory values in pages):\n");
     426           0 :         pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
     427             : 
     428           0 :         if (is_memcg_oom(oc))
     429             :                 mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);
     430             :         else {
     431             :                 struct task_struct *p;
     432             : 
     433             :                 rcu_read_lock();
     434           0 :                 for_each_process(p)
     435           0 :                         dump_task(p, oc);
     436             :                 rcu_read_unlock();
     437             :         }
     438           0 : }
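
    With the header and the pr_info() format above, each eligible task
    contributes one line to the log; a hypothetical rendering (all values
    invented):

        [  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name
        [   1234]  1000  1234    58392     4312          94208       32             0 bash
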
     439             : 
     440           0 : static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
     441             : {
     442             :         /* one line summary of the oom killer context. */
     443           0 :         pr_info("oom-kill:constraint=%s,nodemask=%*pbl",
     444             :                         oom_constraint_text[oc->constraint],
     445             :                         nodemask_pr_args(oc->nodemask));
     446             :         cpuset_print_current_mems_allowed();
     447           0 :         mem_cgroup_print_oom_context(oc->memcg, victim);
     448           0 :         pr_cont(",task=%s,pid=%d,uid=%d\n", victim->comm, victim->pid,
     449             :                 from_kuid(&init_user_ns, task_uid(victim)));
     450           0 : }
     451             : 
     452           0 : static void dump_header(struct oom_control *oc, struct task_struct *p)
     453             : {
     454           0 :         pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
     455             :                 current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
     456             :                         current->signal->oom_score_adj);
     457             :         if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
     458             :                 pr_warn("COMPACTION is disabled!!!\n");
     459             : 
     460           0 :         dump_stack();
     461           0 :         if (is_memcg_oom(oc))
     462             :                 mem_cgroup_print_oom_meminfo(oc->memcg);
     463             :         else {
     464           0 :                 show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask);
     465           0 :                 if (should_dump_unreclaim_slab())
     466           0 :                         dump_unreclaimable_slab();
     467             :         }
     468           0 :         if (sysctl_oom_dump_tasks)
     469           0 :                 dump_tasks(oc);
     470           0 :         if (p)
     471           0 :                 dump_oom_summary(oc, p);
     472           0 : }
     473             : 
     474             : /*
     475             :  * Number of OOM victims in flight
     476             :  */
     477             : static atomic_t oom_victims = ATOMIC_INIT(0);
     478             : static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
     479             : 
     480             : static bool oom_killer_disabled __read_mostly;
     481             : 
     482             : #define K(x) ((x) << (PAGE_SHIFT-10))
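
    For example, with 4 KiB pages PAGE_SHIFT is 12, so K(x) == x << 2 == x * 4:
    a count of pages becomes a size in KiB, which is how the *-rss:%lukB figures
    in the kill messages below are produced.
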
     483             : 
     484             : /*
     485             :  * task->mm can be NULL if the task is the exited group leader.  So to
     486             :  * determine whether the task is using a particular mm, we examine all the
     487             :  * task's threads: if one of those is using this mm then this task was also
     488             :  * using it.
     489             :  */
     490           0 : bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
     491             : {
     492             :         struct task_struct *t;
     493             : 
     494           0 :         for_each_thread(p, t) {
     495           0 :                 struct mm_struct *t_mm = READ_ONCE(t->mm);
     496           0 :                 if (t_mm)
     497           0 :                         return t_mm == mm;
     498             :         }
     499             :         return false;
     500             : }
     501             : 
     502             : #ifdef CONFIG_MMU
     503             : /*
     504             :  * OOM Reaper kernel thread which tries to reap the memory used by the OOM
     505             :  * victim (if that is possible) to help the OOM killer to move on.
     506             :  */
     507             : static struct task_struct *oom_reaper_th;
     508             : static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
     509             : static struct task_struct *oom_reaper_list;
     510             : static DEFINE_SPINLOCK(oom_reaper_lock);
     511             : 
     512           0 : bool __oom_reap_task_mm(struct mm_struct *mm)
     513             : {
     514             :         struct vm_area_struct *vma;
     515           0 :         bool ret = true;
     516             : 
     517             :         /*
     518             :          * Tell all users of get_user/copy_from_user etc... that the content
     519             :          * is no longer stable. No barriers really needed because unmapping
     520             :          * should imply barriers already and the reader would hit a page fault
      521             :          * if it stumbled over reaped memory.
     522             :          */
     523           0 :         set_bit(MMF_UNSTABLE, &mm->flags);
     524             : 
     525           0 :         for (vma = mm->mmap ; vma; vma = vma->vm_next) {
     526           0 :                 if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP))
     527           0 :                         continue;
     528             : 
     529             :                 /*
     530             :                  * Only anonymous pages have a good chance to be dropped
     531             :                  * without additional steps which we cannot afford as we
     532             :                  * are OOM already.
     533             :                  *
     534             :                  * We do not even care about fs backed pages because all
     535             :                  * which are reclaimable have already been reclaimed and
     536             :                  * we do not want to block exit_mmap by keeping mm ref
     537             :                  * count elevated without a good reason.
     538             :                  */
     539           0 :                 if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
     540             :                         struct mmu_notifier_range range;
     541             :                         struct mmu_gather tlb;
     542             : 
     543           0 :                         mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0,
     544             :                                                 vma, mm, vma->vm_start,
     545             :                                                 vma->vm_end);
     546           0 :                         tlb_gather_mmu(&tlb, mm);
     547           0 :                         if (mmu_notifier_invalidate_range_start_nonblock(&range)) {
     548             :                                 tlb_finish_mmu(&tlb);
     549             :                                 ret = false;
     550             :                                 continue;
     551             :                         }
     552           0 :                         unmap_page_range(&tlb, vma, range.start, range.end, NULL);
     553           0 :                         mmu_notifier_invalidate_range_end(&range);
     554           0 :                         tlb_finish_mmu(&tlb);
     555             :                 }
     556             :         }
     557             : 
     558           0 :         return ret;
     559             : }
     560             : 
     561             : /*
      562             :  * Reaps the address space of the given task.
      563             :  *
      564             :  * Returns true on success, and false if none or only part of the address
      565             :  * space could be reclaimed, in which case the caller should retry later.
     566             :  */
     567           0 : static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
     568             : {
     569           0 :         bool ret = true;
     570             : 
     571           0 :         if (!mmap_read_trylock(mm)) {
     572             :                 trace_skip_task_reaping(tsk->pid);
     573             :                 return false;
     574             :         }
     575             : 
     576             :         /*
     577             :          * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
     578             :          * work on the mm anymore. The check for MMF_OOM_SKIP must run
     579             :          * under mmap_lock for reading because it serializes against the
     580             :          * mmap_write_lock();mmap_write_unlock() cycle in exit_mmap().
     581             :          */
     582           0 :         if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
     583             :                 trace_skip_task_reaping(tsk->pid);
     584             :                 goto out_unlock;
     585             :         }
     586             : 
     587           0 :         trace_start_task_reaping(tsk->pid);
     588             : 
     589             :         /* failed to reap part of the address space. Try again later */
     590           0 :         ret = __oom_reap_task_mm(mm);
     591           0 :         if (!ret)
     592             :                 goto out_finish;
     593             : 
     594           0 :         pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
     595             :                         task_pid_nr(tsk), tsk->comm,
     596             :                         K(get_mm_counter(mm, MM_ANONPAGES)),
     597             :                         K(get_mm_counter(mm, MM_FILEPAGES)),
     598             :                         K(get_mm_counter(mm, MM_SHMEMPAGES)));
     599             : out_finish:
     600           0 :         trace_finish_task_reaping(tsk->pid);
     601             : out_unlock:
     602           0 :         mmap_read_unlock(mm);
     603             : 
     604           0 :         return ret;
     605             : }
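
    The trylock-and-back-off shape above is deliberate: the reaper must never
    sleep waiting for mmap_lock, because the holder may itself be stuck
    allocating memory. A minimal kernel-style sketch of the pattern
    (hypothetical function):

        /* Hypothetical non-blocking worker: bail out rather than sleep on the lock. */
        static bool try_inspect_mm(struct mm_struct *mm)
        {
                if (!mmap_read_trylock(mm))
                        return false;   /* contended: caller retries later */

                /* ... walk the address space under the read lock ... */

                mmap_read_unlock(mm);
                return true;
        }
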
     606             : 
     607             : #define MAX_OOM_REAP_RETRIES 10
     608           0 : static void oom_reap_task(struct task_struct *tsk)
     609             : {
     610           0 :         int attempts = 0;
     611           0 :         struct mm_struct *mm = tsk->signal->oom_mm;
     612             : 
     613             :         /* Retry the mmap_read_trylock(mm) a few times */
     614           0 :         while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
     615           0 :                 schedule_timeout_idle(HZ/10);
     616             : 
     617           0 :         if (attempts <= MAX_OOM_REAP_RETRIES ||
     618           0 :             test_bit(MMF_OOM_SKIP, &mm->flags))
     619             :                 goto done;
     620             : 
     621           0 :         pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
     622             :                 task_pid_nr(tsk), tsk->comm);
     623           0 :         sched_show_task(tsk);
     624             :         debug_show_all_locks();
     625             : 
     626             : done:
     627           0 :         tsk->oom_reaper_list = NULL;
     628             : 
     629             :         /*
      630             :          * Hide this mm from the OOM killer because it has either been reaped
      631             :          * or somebody could not call mmap_write_unlock(mm).
     632             :          */
     633           0 :         set_bit(MMF_OOM_SKIP, &mm->flags);
     634             : 
     635             :         /* Drop a reference taken by queue_oom_reaper */
     636           0 :         put_task_struct(tsk);
     637           0 : }
     638             : 
     639           1 : static int oom_reaper(void *unused)
     640             : {
     641           1 :         set_freezable();
     642             : 
     643             :         while (true) {
     644           1 :                 struct task_struct *tsk = NULL;
     645             : 
     646           1 :                 wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
     647           0 :                 spin_lock_irq(&oom_reaper_lock);
     648           0 :                 if (oom_reaper_list != NULL) {
     649           0 :                         tsk = oom_reaper_list;
     650           0 :                         oom_reaper_list = tsk->oom_reaper_list;
     651             :                 }
     652           0 :                 spin_unlock_irq(&oom_reaper_lock);
     653             : 
     654           0 :                 if (tsk)
     655           0 :                         oom_reap_task(tsk);
     656             :         }
     657             : 
     658             :         return 0;
     659             : }
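
    oom_reaper() has the standard single-consumer kthread shape: sleep on a
    waitqueue until a producer links work onto a shared list and wakes the
    thread, then detach one item under the lock and process it unlocked. A
    stripped-down kernel-style sketch (hypothetical names; producers would do
    list_add_tail() followed by wake_up()):

        struct work_item {
                struct list_head node;
                /* payload ... */
        };

        static LIST_HEAD(work_list);
        static DEFINE_SPINLOCK(work_lock);
        static DECLARE_WAIT_QUEUE_HEAD(work_wait);

        static int worker_fn(void *unused)
        {
                while (!kthread_should_stop()) {
                        struct work_item *w = NULL;

                        wait_event(work_wait, !list_empty(&work_list));
                        spin_lock_irq(&work_lock);
                        if (!list_empty(&work_list)) { /* recheck under the lock */
                                w = list_first_entry(&work_list,
                                                     struct work_item, node);
                                list_del(&w->node);
                        }
                        spin_unlock_irq(&work_lock);
                        if (w)
                                process_item(w);       /* hypothetical, unlocked */
                }
                return 0;
        }
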
     660             : 
     661           0 : static void wake_oom_reaper(struct timer_list *timer)
     662             : {
     663           0 :         struct task_struct *tsk = container_of(timer, struct task_struct,
     664             :                         oom_reaper_timer);
     665           0 :         struct mm_struct *mm = tsk->signal->oom_mm;
     666             :         unsigned long flags;
     667             : 
     668             :         /* The victim managed to terminate on its own - see exit_mmap */
     669           0 :         if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
     670           0 :                 put_task_struct(tsk);
     671           0 :                 return;
     672             :         }
     673             : 
     674           0 :         spin_lock_irqsave(&oom_reaper_lock, flags);
     675           0 :         tsk->oom_reaper_list = oom_reaper_list;
     676           0 :         oom_reaper_list = tsk;
     677           0 :         spin_unlock_irqrestore(&oom_reaper_lock, flags);
     678           0 :         trace_wake_reaper(tsk->pid);
     679           0 :         wake_up(&oom_reaper_wait);
     680             : }
     681             : 
     682             : /*
      683             :  * Give the OOM victim time to exit naturally before invoking oom reaping.
      684             :  * The timer's timeout is arbitrary... the longer it is, the longer the worst
      685             :  * case scenario for the OOM can take. If it is too small, the oom_reaper can
      686             :  * get in the way and release resources needed by the process exit path,
      687             :  * e.g. the futex robust list can sit in Anon|Private memory that gets reaped
     688             :  * before the exit path is able to wake the futex waiters.
     689             :  */
     690             : #define OOM_REAPER_DELAY (2*HZ)
     691           0 : static void queue_oom_reaper(struct task_struct *tsk)
     692             : {
     693             :         /* mm is already queued? */
     694           0 :         if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
     695             :                 return;
     696             : 
     697           0 :         get_task_struct(tsk);
     698           0 :         timer_setup(&tsk->oom_reaper_timer, wake_oom_reaper, 0);
     699           0 :         tsk->oom_reaper_timer.expires = jiffies + OOM_REAPER_DELAY;
     700           0 :         add_timer(&tsk->oom_reaper_timer);
     701             : }
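
    queue_oom_reaper() uses the one-shot timer idiom: timer_setup() binds a
    callback to a timer embedded in a larger object, and the callback recovers
    that object with container_of(), exactly as wake_oom_reaper() does above. A
    minimal kernel-style sketch (hypothetical struct):

        struct delayed_ping {
                struct timer_list timer;
                int id;
        };

        static void ping_fire(struct timer_list *t)
        {
                /* Recover the enclosing object from the embedded timer. */
                struct delayed_ping *p = container_of(t, struct delayed_ping, timer);

                pr_info("ping %d fired\n", p->id);
        }

        static void ping_arm(struct delayed_ping *p, unsigned long delay)
        {
                timer_setup(&p->timer, ping_fire, 0);
                p->timer.expires = jiffies + delay;
                add_timer(&p->timer);   /* one-shot: fires once after the delay */
        }
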
     702             : 
     703           1 : static int __init oom_init(void)
     704             : {
     705           2 :         oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
     706           1 :         return 0;
     707             : }
     708             : subsys_initcall(oom_init)
     709             : #else
     710             : static inline void queue_oom_reaper(struct task_struct *tsk)
     711             : {
     712             : }
     713             : #endif /* CONFIG_MMU */
     714             : 
     715             : /**
     716             :  * mark_oom_victim - mark the given task as OOM victim
     717             :  * @tsk: task to mark
     718             :  *
     719             :  * Has to be called with oom_lock held and never after
     720             :  * oom has been disabled already.
     721             :  *
      722             :  * tsk->mm has to be non-NULL and the caller has to guarantee it is stable
      723             :  * (either by holding task_lock or by operating on current).
     724             :  */
     725           0 : static void mark_oom_victim(struct task_struct *tsk)
     726             : {
     727           0 :         struct mm_struct *mm = tsk->mm;
     728             : 
     729           0 :         WARN_ON(oom_killer_disabled);
     730             :         /* OOM killer might race with memcg OOM */
     731           0 :         if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
     732             :                 return;
     733             : 
      734             :         /* oom_mm is bound to the signal struct lifetime. */
     735           0 :         if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
     736           0 :                 mmgrab(tsk->signal->oom_mm);
     737           0 :                 set_bit(MMF_OOM_VICTIM, &mm->flags);
     738             :         }
     739             : 
     740             :         /*
      741             :          * Make sure that the task is woken up from uninterruptible sleep
      742             :          * if it is frozen, because the OOM killer would not be able to free
      743             :          * any memory and would livelock. freezing_slow_path will tell the
      744             :          * freezer that TIF_MEMDIE tasks should be ignored.
     745             :          */
     746           0 :         __thaw_task(tsk);
     747           0 :         atomic_inc(&oom_victims);
     748           0 :         trace_mark_victim(tsk->pid);
     749             : }
     750             : 
     751             : /**
     752             :  * exit_oom_victim - note the exit of an OOM victim
     753             :  */
     754           0 : void exit_oom_victim(void)
     755             : {
     756           0 :         clear_thread_flag(TIF_MEMDIE);
     757             : 
     758           0 :         if (!atomic_dec_return(&oom_victims))
     759           0 :                 wake_up_all(&oom_victims_wait);
     760           0 : }
     761             : 
     762             : /**
     763             :  * oom_killer_enable - enable OOM killer
     764             :  */
     765           0 : void oom_killer_enable(void)
     766             : {
     767           0 :         oom_killer_disabled = false;
     768           0 :         pr_info("OOM killer enabled.\n");
     769           0 : }
     770             : 
     771             : /**
     772             :  * oom_killer_disable - disable OOM killer
     773             :  * @timeout: maximum timeout to wait for oom victims in jiffies
     774             :  *
     775             :  * Forces all page allocations to fail rather than trigger OOM killer.
     776             :  * Will block and wait until all OOM victims are killed or the given
     777             :  * timeout expires.
     778             :  *
     779             :  * The function cannot be called when there are runnable user tasks because
      780             :  * userspace would see unexpected allocation failures as a result. Any
      781             :  * new use of this function should be discussed with MM people.
     782             :  *
     783             :  * Returns true if successful and false if the OOM killer cannot be
     784             :  * disabled.
     785             :  */
     786           0 : bool oom_killer_disable(signed long timeout)
     787             : {
     788             :         signed long ret;
     789             : 
     790             :         /*
     791             :          * Make sure to not race with an ongoing OOM killer. Check that the
     792             :          * current is not killed (possibly due to sharing the victim's memory).
     793             :          */
     794           0 :         if (mutex_lock_killable(&oom_lock))
     795             :                 return false;
     796           0 :         oom_killer_disabled = true;
     797           0 :         mutex_unlock(&oom_lock);
     798             : 
     799           0 :         ret = wait_event_interruptible_timeout(oom_victims_wait,
     800             :                         !atomic_read(&oom_victims), timeout);
     801           0 :         if (ret <= 0) {
     802             :                 oom_killer_enable();
     803           0 :                 return false;
     804             :         }
     805           0 :         pr_info("OOM killer disabled.\n");
     806             : 
     807           0 :         return true;
     808             : }
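
    A hedged sketch of the intended pairing, in the spirit of the suspend/freeze
    path (the caller and timeout are hypothetical):

        /* Hypothetical caller: user tasks are already frozen at this point. */
        static int with_oom_killer_disabled(void)
        {
                if (!oom_killer_disable(msecs_to_jiffies(20000)))
                        return -EBUSY;  /* victims still exiting; give up */

                /* ... work that must not race with new OOM kills ... */

                oom_killer_enable();
                return 0;
        }
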
     809             : 
     810             : static inline bool __task_will_free_mem(struct task_struct *task)
     811             : {
     812           0 :         struct signal_struct *sig = task->signal;
     813             : 
     814             :         /*
     815             :          * A coredumping process may sleep for an extended period in
     816             :          * coredump_task_exit(), so the oom killer cannot assume that
     817             :          * the process will promptly exit and release memory.
     818             :          */
     819           0 :         if (sig->core_state)
     820             :                 return false;
     821             : 
     822           0 :         if (sig->flags & SIGNAL_GROUP_EXIT)
     823             :                 return true;
     824             : 
     825           0 :         if (thread_group_empty(task) && (task->flags & PF_EXITING))
     826             :                 return true;
     827             : 
     828             :         return false;
     829             : }
     830             : 
     831             : /*
     832             :  * Checks whether the given task is dying or exiting and likely to
     833             :  * release its address space. This means that all threads and processes
     834             :  * sharing the same mm have to be killed or exiting.
      835             :  * The caller has to make sure that task->mm is stable (either by holding
      836             :  * task_lock or by operating on current).
     837             :  */
     838           0 : static bool task_will_free_mem(struct task_struct *task)
     839             : {
     840           0 :         struct mm_struct *mm = task->mm;
     841             :         struct task_struct *p;
     842           0 :         bool ret = true;
     843             : 
     844             :         /*
      845             :          * Skip tasks without an mm because they might have already passed
      846             :          * exit_mm and exit_oom_victim. oom_reaper could have rescued that but
      847             :          * do not rely on that for now. We can consider find_lock_task_mm in future.
     848             :          */
     849           0 :         if (!mm)
     850             :                 return false;
     851             : 
     852           0 :         if (!__task_will_free_mem(task))
     853             :                 return false;
     854             : 
     855             :         /*
     856             :          * This task has already been drained by the oom reaper so there are
     857             :          * only small chances it will free some more
     858             :          */
     859           0 :         if (test_bit(MMF_OOM_SKIP, &mm->flags))
     860             :                 return false;
     861             : 
     862           0 :         if (atomic_read(&mm->mm_users) <= 1)
     863             :                 return true;
     864             : 
     865             :         /*
      866             :          * Make sure that all tasks which share the mm with the given task
     867             :          * are dying as well to make sure that a) nobody pins its mm and
     868             :          * b) the task is also reapable by the oom reaper.
     869             :          */
     870             :         rcu_read_lock();
     871           0 :         for_each_process(p) {
     872           0 :                 if (!process_shares_mm(p, mm))
     873           0 :                         continue;
     874           0 :                 if (same_thread_group(task, p))
     875           0 :                         continue;
     876           0 :                 ret = __task_will_free_mem(p);
     877           0 :                 if (!ret)
     878             :                         break;
     879             :         }
     880             :         rcu_read_unlock();
     881             : 
     882           0 :         return ret;
     883             : }
     884             : 
     885           0 : static void __oom_kill_process(struct task_struct *victim, const char *message)
     886             : {
     887             :         struct task_struct *p;
     888             :         struct mm_struct *mm;
     889           0 :         bool can_oom_reap = true;
     890             : 
     891           0 :         p = find_lock_task_mm(victim);
     892           0 :         if (!p) {
     893           0 :                 pr_info("%s: OOM victim %d (%s) is already exiting. Skip killing the task\n",
     894             :                         message, task_pid_nr(victim), victim->comm);
     895           0 :                 put_task_struct(victim);
     896           0 :                 return;
     897           0 :         } else if (victim != p) {
     898           0 :                 get_task_struct(p);
     899           0 :                 put_task_struct(victim);
     900           0 :                 victim = p;
     901             :         }
     902             : 
     903             :         /* Get a reference to safely compare mm after task_unlock(victim) */
     904           0 :         mm = victim->mm;
     905           0 :         mmgrab(mm);
     906             : 
     907             :         /* Raise event before sending signal: task reaper must see this */
     908           0 :         count_vm_event(OOM_KILL);
     909           0 :         memcg_memory_event_mm(mm, MEMCG_OOM_KILL);
     910             : 
     911             :         /*
     912             :          * We should send SIGKILL before granting access to memory reserves
     913             :          * in order to prevent the OOM victim from depleting the memory
     914             :          * reserves from the user space under its control.
     915             :          */
     916           0 :         do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID);
     917           0 :         mark_oom_victim(victim);
     918           0 :         pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n",
     919             :                 message, task_pid_nr(victim), victim->comm, K(mm->total_vm),
     920             :                 K(get_mm_counter(mm, MM_ANONPAGES)),
     921             :                 K(get_mm_counter(mm, MM_FILEPAGES)),
     922             :                 K(get_mm_counter(mm, MM_SHMEMPAGES)),
     923             :                 from_kuid(&init_user_ns, task_uid(victim)),
     924             :                 mm_pgtables_bytes(mm) >> 10, victim->signal->oom_score_adj);
     925           0 :         task_unlock(victim);
     926             : 
     927             :         /*
     928             :          * Kill all user processes sharing victim->mm in other thread groups, if
     929             :          * any.  They don't get access to memory reserves, though, to avoid
     930             :          * depletion of all memory.  This prevents mm->mmap_lock livelock when an
     931             :          * oom killed thread cannot exit because it requires the semaphore and
      932             :          * it's contended by another thread trying to allocate memory itself.
     933             :          * That thread will now get access to memory reserves since it has a
     934             :          * pending fatal signal.
     935             :          */
     936             :         rcu_read_lock();
     937           0 :         for_each_process(p) {
     938           0 :                 if (!process_shares_mm(p, mm))
     939           0 :                         continue;
     940           0 :                 if (same_thread_group(p, victim))
     941           0 :                         continue;
     942           0 :                 if (is_global_init(p)) {
     943           0 :                         can_oom_reap = false;
     944           0 :                         set_bit(MMF_OOM_SKIP, &mm->flags);
     945           0 :                         pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
     946             :                                         task_pid_nr(victim), victim->comm,
     947             :                                         task_pid_nr(p), p->comm);
     948           0 :                         continue;
     949             :                 }
     950             :                 /*
     951             :                  * No kthread_use_mm() user needs to read from the userspace so
     952             :                  * we are ok to reap it.
     953             :                  */
     954           0 :                 if (unlikely(p->flags & PF_KTHREAD))
     955           0 :                         continue;
     956           0 :                 do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);
     957             :         }
     958             :         rcu_read_unlock();
     959             : 
     960           0 :         if (can_oom_reap)
     961           0 :                 queue_oom_reaper(victim);
     962             : 
     963           0 :         mmdrop(mm);
     964           0 :         put_task_struct(victim);
     965             : }
     966             : #undef K
     967             : 
     968             : /*
     969             :  * Kill provided task unless it's secured by setting
     970             :  * oom_score_adj to OOM_SCORE_ADJ_MIN.
     971             :  */
     972             : static int oom_kill_memcg_member(struct task_struct *task, void *message)
     973             : {
     974             :         if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN &&
     975             :             !is_global_init(task)) {
     976             :                 get_task_struct(task);
     977             :                 __oom_kill_process(task, message);
     978             :         }
     979             :         return 0;
     980             : }
     981             : 
     982           0 : static void oom_kill_process(struct oom_control *oc, const char *message)
     983             : {
     984           0 :         struct task_struct *victim = oc->chosen;
     985             :         struct mem_cgroup *oom_group;
     986             :         static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
     987             :                                               DEFAULT_RATELIMIT_BURST);
     988             : 
     989             :         /*
     990             :          * If the task is already exiting, don't alarm the sysadmin or kill
     991             :          * its children or threads, just give it access to memory reserves
     992             :          * so it can die quickly
     993             :          */
     994           0 :         task_lock(victim);
     995           0 :         if (task_will_free_mem(victim)) {
     996           0 :                 mark_oom_victim(victim);
     997           0 :                 queue_oom_reaper(victim);
     998           0 :                 task_unlock(victim);
     999           0 :                 put_task_struct(victim);
    1000           0 :                 return;
    1001             :         }
    1002           0 :         task_unlock(victim);
    1003             : 
    1004           0 :         if (__ratelimit(&oom_rs))
    1005           0 :                 dump_header(oc, victim);
    1006             : 
    1007             :         /*
    1008             :          * Do we need to kill the entire memory cgroup?
    1009             :          * Or even one of the ancestor memory cgroups?
    1010             :          * Check this out before killing the victim task.
    1011             :          */
    1012           0 :         oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);
    1013             : 
    1014           0 :         __oom_kill_process(victim, message);
    1015             : 
    1016             :         /*
    1017             :          * If necessary, kill all tasks in the selected memory cgroup.
    1018             :          */
    1019             :         if (oom_group) {
    1020             :                 memcg_memory_event(oom_group, MEMCG_OOM_GROUP_KILL);
    1021             :                 mem_cgroup_print_oom_group(oom_group);
    1022             :                 mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member,
    1023             :                                       (void *)message);
    1024             :                 mem_cgroup_put(oom_group);
    1025             :         }
    1026             : }
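
mem_cgroup_get_oom_group() returns a non-NULL group only when the workload
opted in through the cgroup v2 memory.oom.group control. A hypothetical
userspace sketch follows; the cgroup path is an assumption for
illustration.

        /* illustrative userspace sketch - the cgroup path is hypothetical */
        #include <fcntl.h>
        #include <unistd.h>

        int main(void)
        {
                /* mark the cgroup as an indivisible workload: on OOM,
                 * every task in it is killed together by the scan above */
                int fd = open("/sys/fs/cgroup/mygroup/memory.oom.group",
                              O_WRONLY);

                if (fd < 0)
                        return 1;
                if (write(fd, "1", 1) != 1) {
                        close(fd);
                        return 1;
                }
                close(fd);
                return 0;
        }
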
    1027             : 
    1028             : /*
    1029             :  * Determines whether the kernel must panic because of the panic_on_oom sysctl.
    1030             :  */
    1031           0 : static void check_panic_on_oom(struct oom_control *oc)
    1032             : {
    1033           0 :         if (likely(!sysctl_panic_on_oom))
    1034             :                 return;
    1035           0 :         if (sysctl_panic_on_oom != 2) {
    1036             :                 /*
    1037             :                  * panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
    1038             :                  * does not panic for cpuset, mempolicy, or memcg allocation
    1039             :                  * failures.
    1040             :                  */
    1041           0 :                 if (oc->constraint != CONSTRAINT_NONE)
    1042             :                         return;
    1043             :         }
    1044             :         /* Do not panic for oom kills triggered by sysrq */
    1045           0 :         if (is_sysrq_oom(oc))
    1046             :                 return;
    1047           0 :         dump_header(oc, NULL);
    1048           0 :         panic("Out of memory: %s panic_on_oom is enabled\n",
    1049           0 :                 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
    1050             : }
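
The sysctl checked above can be changed at runtime through procfs. A
minimal sketch, assuming the usual /proc/sys mount:

        /* illustrative userspace sketch */
        #include <fcntl.h>
        #include <unistd.h>

        int main(void)
        {
                /* 0 = never panic, 1 = panic only on unconstrained
                 * (system-wide) OOM, 2 = always panic, even for
                 * cpuset/mempolicy/memcg constrained OOMs */
                int fd = open("/proc/sys/vm/panic_on_oom", O_WRONLY);

                if (fd < 0)
                        return 1;
                write(fd, "1", 1);
                close(fd);
                return 0;
        }
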
    1051             : 
    1052             : static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
    1053             : 
    1054           0 : int register_oom_notifier(struct notifier_block *nb)
    1055             : {
    1056           0 :         return blocking_notifier_chain_register(&oom_notify_list, nb);
    1057             : }
    1058             : EXPORT_SYMBOL_GPL(register_oom_notifier);
    1059             : 
    1060           0 : int unregister_oom_notifier(struct notifier_block *nb)
    1061             : {
    1062           0 :         return blocking_notifier_chain_unregister(&oom_notify_list, nb);
    1063             : }
    1064             : EXPORT_SYMBOL_GPL(unregister_oom_notifier);
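
A minimal sketch of a client of this API: a hypothetical module that
registers on the chain and reports how many pages it freed.
out_of_memory() sums the callbacks' contributions into its local `freed`
counter and skips the kill entirely if anything was reclaimed.

        /* illustrative module sketch - my_oom_notify is hypothetical */
        #include <linux/module.h>
        #include <linux/notifier.h>
        #include <linux/oom.h>

        static int my_oom_notify(struct notifier_block *nb,
                                 unsigned long unused, void *parm)
        {
                unsigned long *freed = parm;

                /* drop driver-private caches here and add the number of
                 * pages released */
                *freed += 0;    /* placeholder: nothing freed in this sketch */
                return NOTIFY_OK;
        }

        static struct notifier_block my_oom_nb = {
                .notifier_call = my_oom_notify,
        };

        static int __init my_oom_init(void)
        {
                return register_oom_notifier(&my_oom_nb);
        }

        static void __exit my_oom_exit(void)
        {
                unregister_oom_notifier(&my_oom_nb);
        }

        module_init(my_oom_init);
        module_exit(my_oom_exit);
        MODULE_LICENSE("GPL");
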
    1065             : 
    1066             : /**
    1067             :  * out_of_memory - kill the "best" process when we run out of memory
    1068             :  * @oc: pointer to struct oom_control
    1069             :  *
    1070             :  * If we run out of memory, we have the choice between either
    1071             :  * killing a random task (bad), letting the system crash (worse)
    1072             :  * OR try to be smart about which process to kill. Note that we
    1073             :  * don't have to be perfect here, we just have to be good.
    1074             :  */
    1075           0 : bool out_of_memory(struct oom_control *oc)
    1076             : {
    1077           0 :         unsigned long freed = 0;
    1078             : 
    1079           0 :         if (oom_killer_disabled)
    1080             :                 return false;
    1081             : 
    1082           0 :         if (!is_memcg_oom(oc)) {
    1083           0 :                 blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
    1084           0 :                 if (freed > 0 && !is_sysrq_oom(oc))
    1085             :                         /* Got some memory back in the last second. */
    1086             :                         return true;
    1087             :         }
    1088             : 
    1089             :         /*
    1090             :          * If current has a pending SIGKILL or is exiting, then automatically
    1091             :          * select it.  The goal is to allow it to allocate so that it may
    1092             :          * quickly exit and free its memory.
    1093             :          */
    1094           0 :         if (task_will_free_mem(current)) {
    1095           0 :                 mark_oom_victim(current);
    1096           0 :                 queue_oom_reaper(current);
    1097           0 :                 return true;
    1098             :         }
    1099             : 
     1100             :         /*
     1101             :          * The OOM killer does not compensate for IO-less reclaim.
     1102             :          * pagefault_out_of_memory lost its gfp context, so make sure to
     1103             :          * exclude the zero mask - all other users should have at least
     1104             :          * ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to
     1105             :          * invoke the OOM killer even if it is a GFP_NOFS allocation.
     1106             :          */
    1107           0 :         if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))
    1108             :                 return true;
    1109             : 
    1110             :         /*
    1111             :          * Check if there were limitations on the allocation (only relevant for
    1112             :          * NUMA and memcg) that may require different handling.
    1113             :          */
    1114           0 :         oc->constraint = constrained_alloc(oc);
    1115           0 :         if (oc->constraint != CONSTRAINT_MEMORY_POLICY)
    1116           0 :                 oc->nodemask = NULL;
    1117           0 :         check_panic_on_oom(oc);
    1118             : 
    1119           0 :         if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
    1120           0 :             current->mm && !oom_unkillable_task(current) &&
    1121           0 :             oom_cpuset_eligible(current, oc) &&
    1122           0 :             current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
    1123           0 :                 get_task_struct(current);
    1124           0 :                 oc->chosen = current;
    1125           0 :                 oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
    1126           0 :                 return true;
    1127             :         }
    1128             : 
    1129           0 :         select_bad_process(oc);
    1130             :         /* Found nothing?!?! */
    1131           0 :         if (!oc->chosen) {
    1132           0 :                 dump_header(oc, NULL);
    1133           0 :                 pr_warn("Out of memory and no killable processes...\n");
    1134             :                 /*
    1135             :                  * If we got here due to an actual allocation at the
    1136             :                  * system level, we cannot survive this and will enter
    1137             :                  * an endless loop in the allocator. Bail out now.
    1138             :                  */
    1139           0 :                 if (!is_sysrq_oom(oc) && !is_memcg_oom(oc))
    1140           0 :                         panic("System is deadlocked on memory\n");
    1141             :         }
    1142           0 :         if (oc->chosen && oc->chosen != (void *)-1UL)
    1143           0 :                 oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
    1144             :                                  "Memory cgroup out of memory");
    1145           0 :         return !!oc->chosen;
    1146             : }
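
The is_sysrq_oom() checks above correspond to the manual OOM kill exposed
through the magic SysRq key ('f'). A hedged sketch of triggering it from
userspace (assumes SysRq is enabled via the kernel.sysrq sysctl):

        /* illustrative userspace sketch */
        #include <fcntl.h>
        #include <unistd.h>

        int main(void)
        {
                /* SysRq 'f' invokes out_of_memory() with is_sysrq_oom()
                 * true: the panic_on_oom path is skipped, and finding no
                 * killable process does not panic the system */
                int fd = open("/proc/sysrq-trigger", O_WRONLY);

                if (fd < 0)
                        return 1;
                write(fd, "f", 1);
                close(fd);
                return 0;
        }
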
    1147             : 
     1148             : /*
     1149             :  * The pagefault handler calls here because some allocation has failed. We
     1150             :  * handle the memcg OOM here because this is the only safe context with no
     1151             :  * locks held; the oom killer triggered from the allocation context takes
     1152             :  * care of the global OOM.
     1153             :  */
    1154           0 : void pagefault_out_of_memory(void)
    1155             : {
    1156             :         static DEFINE_RATELIMIT_STATE(pfoom_rs, DEFAULT_RATELIMIT_INTERVAL,
    1157             :                                       DEFAULT_RATELIMIT_BURST);
    1158             : 
    1159           0 :         if (mem_cgroup_oom_synchronize(true))
    1160             :                 return;
    1161             : 
    1162           0 :         if (fatal_signal_pending(current))
    1163             :                 return;
    1164             : 
    1165           0 :         if (__ratelimit(&pfoom_rs))
    1166           0 :                 pr_warn("Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF\n");
    1167             : }
    1168             : 
    1169           0 : SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
    1170             : {
    1171             : #ifdef CONFIG_MMU
    1172           0 :         struct mm_struct *mm = NULL;
    1173             :         struct task_struct *task;
    1174             :         struct task_struct *p;
    1175             :         unsigned int f_flags;
    1176           0 :         bool reap = false;
    1177           0 :         long ret = 0;
    1178             : 
    1179           0 :         if (flags)
    1180             :                 return -EINVAL;
    1181             : 
    1182           0 :         task = pidfd_get_task(pidfd, &f_flags);
    1183           0 :         if (IS_ERR(task))
    1184           0 :                 return PTR_ERR(task);
    1185             : 
    1186             :         /*
    1187             :          * Make sure to choose a thread which still has a reference to mm
    1188             :          * during the group exit
    1189             :          */
    1190           0 :         p = find_lock_task_mm(task);
    1191           0 :         if (!p) {
    1192             :                 ret = -ESRCH;
    1193             :                 goto put_task;
    1194             :         }
    1195             : 
    1196           0 :         mm = p->mm;
    1197           0 :         mmgrab(mm);
    1198             : 
    1199           0 :         if (task_will_free_mem(p))
    1200             :                 reap = true;
    1201             :         else {
    1202             :                 /* Error only if the work has not been done already */
    1203           0 :                 if (!test_bit(MMF_OOM_SKIP, &mm->flags))
    1204           0 :                         ret = -EINVAL;
    1205             :         }
    1206             :         task_unlock(p);
    1207             : 
    1208           0 :         if (!reap)
    1209             :                 goto drop_mm;
    1210             : 
    1211           0 :         if (mmap_read_lock_killable(mm)) {
    1212             :                 ret = -EINTR;
    1213             :                 goto drop_mm;
    1214             :         }
     1215             :         /*
     1216             :          * Check MMF_OOM_SKIP again under mmap_read_lock protection to
     1217             :          * ensure that a possible concurrent change in exit_mmap() is seen.
     1218             :          */
    1219           0 :         if (!test_bit(MMF_OOM_SKIP, &mm->flags) && !__oom_reap_task_mm(mm))
    1220           0 :                 ret = -EAGAIN;
    1221             :         mmap_read_unlock(mm);
    1222             : 
    1223             : drop_mm:
    1224             :         mmdrop(mm);
    1225             : put_task:
    1226           0 :         put_task_struct(task);
    1227           0 :         return ret;
    1228             : #else
    1229             :         return -ENOSYS;
    1230             : #endif /* CONFIG_MMU */
    1231             : }
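
Userspace is expected to kill the target first and then call
process_mrelease() on a pidfd, so the victim's anonymous memory is reaped
without waiting for the dying task to be scheduled. A minimal sketch using
raw syscalls; the fallback syscall numbers are the generic/x86-64 table
values and are an assumption (recent libcs ship the defines), and
release_memory() is a hypothetical helper name.

        /* illustrative userspace sketch */
        #define _GNU_SOURCE
        #include <signal.h>
        #include <sys/syscall.h>
        #include <unistd.h>

        #ifndef __NR_pidfd_open
        #define __NR_pidfd_open         434     /* assumption: generic table */
        #endif
        #ifndef __NR_pidfd_send_signal
        #define __NR_pidfd_send_signal  424     /* assumption: generic table */
        #endif
        #ifndef __NR_process_mrelease
        #define __NR_process_mrelease   448     /* assumption: generic table */
        #endif

        static int release_memory(pid_t pid)
        {
                int ret;
                int pidfd = syscall(__NR_pidfd_open, pid, 0);

                if (pidfd < 0)
                        return -1;
                /* the task must already be dying, or process_mrelease()
                 * returns -EINVAL per the checks above */
                syscall(__NR_pidfd_send_signal, pidfd, SIGKILL, NULL, 0);
                /* flags must be 0; reaps the mm as the oom reaper would */
                ret = syscall(__NR_process_mrelease, pidfd, 0);
                close(pidfd);
                return ret;
        }
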

Generated by: LCOV version 1.14