Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-only
2 : #include <linux/mm.h>
3 : #include <linux/slab.h>
4 : #include <linux/string.h>
5 : #include <linux/compiler.h>
6 : #include <linux/export.h>
7 : #include <linux/err.h>
8 : #include <linux/sched.h>
9 : #include <linux/sched/mm.h>
10 : #include <linux/sched/signal.h>
11 : #include <linux/sched/task_stack.h>
12 : #include <linux/security.h>
13 : #include <linux/swap.h>
14 : #include <linux/swapops.h>
15 : #include <linux/mman.h>
16 : #include <linux/hugetlb.h>
17 : #include <linux/vmalloc.h>
18 : #include <linux/userfaultfd_k.h>
19 : #include <linux/elf.h>
20 : #include <linux/elf-randomize.h>
21 : #include <linux/personality.h>
22 : #include <linux/random.h>
23 : #include <linux/processor.h>
24 : #include <linux/sizes.h>
25 : #include <linux/compat.h>
26 :
27 : #include <linux/uaccess.h>
28 :
29 : #include "internal.h"
30 :
31 : /**
32 : * kfree_const - conditionally free memory
33 : * @x: pointer to the memory
34 : *
35 : * This function calls kfree() only if @x is not in the .rodata section.
36 : */
37 3040 : void kfree_const(const void *x)
38 : {
39 6080 : if (!is_kernel_rodata((unsigned long)x))
40 714 : kfree(x);
41 3040 : }
42 : EXPORT_SYMBOL(kfree_const);
43 :
44 : /**
45 : * kstrdup - allocate space for and copy an existing string
46 : * @s: the string to duplicate
47 : * @gfp: the GFP mask used in the kmalloc() call when allocating memory
48 : *
49 : * Return: newly allocated copy of @s or %NULL in case of error
50 : */
51 2197 : char *kstrdup(const char *s, gfp_t gfp)
52 : {
53 : size_t len;
54 : char *buf;
55 :
56 2197 : if (!s)
57 : return NULL;
58 :
59 2195 : len = strlen(s) + 1;
60 2195 : buf = kmalloc_track_caller(len, gfp);
61 2195 : if (buf)
62 2195 : memcpy(buf, s, len);
63 : return buf;
64 : }
65 : EXPORT_SYMBOL(kstrdup);
66 :
67 : /**
68 : * kstrdup_const - conditionally duplicate an existing const string
69 : * @s: the string to duplicate
70 : * @gfp: the GFP mask used in the kmalloc() call when allocating memory
71 : *
72 : * Note: Strings allocated by kstrdup_const should be freed by kfree_const and
73 : * must not be passed to krealloc().
74 : *
75 : * Return: the source string if it is in the .rodata section, otherwise a
76 : * newly allocated copy obtained from kstrdup().
77 : */
78 11810 : const char *kstrdup_const(const char *s, gfp_t gfp)
79 : {
80 23620 : if (is_kernel_rodata((unsigned long)s))
81 : return s;
82 :
83 2178 : return kstrdup(s, gfp);
84 : }
85 : EXPORT_SYMBOL(kstrdup_const);
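/*
 * Editor's note: an illustrative sketch, not part of mm/util.c. It shows the
 * intended pairing of kstrdup_const() with kfree_const(): names that point
 * into .rodata are neither copied nor freed. The struct and function names
 * below are hypothetical; the headers already included above suffice.
 */
struct demo_attr {
	const char *name;
};

static int demo_attr_set_name(struct demo_attr *attr, const char *name)
{
	const char *copy = kstrdup_const(name, GFP_KERNEL);

	if (!copy)
		return -ENOMEM;
	kfree_const(attr->name);	/* no-op if the old name was in .rodata */
	attr->name = copy;
	return 0;
}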
86 :
87 : /**
88 : * kstrndup - allocate space for and copy an existing string
89 : * @s: the string to duplicate
90 : * @max: read at most @max chars from @s
91 : * @gfp: the GFP mask used in the kmalloc() call when allocating memory
92 : *
93 : * Note: Use kmemdup_nul() instead if the size is known exactly.
94 : *
95 : * Return: newly allocated copy of @s or %NULL in case of error
96 : */
97 0 : char *kstrndup(const char *s, size_t max, gfp_t gfp)
98 : {
99 : size_t len;
100 : char *buf;
101 :
102 0 : if (!s)
103 : return NULL;
104 :
105 0 : len = strnlen(s, max);
106 0 : buf = kmalloc_track_caller(len+1, gfp);
107 0 : if (buf) {
108 0 : memcpy(buf, s, len);
109 0 : buf[len] = '\0';
110 : }
111 : return buf;
112 : }
113 : EXPORT_SYMBOL(kstrndup);
114 :
115 : /**
116 : * kmemdup - duplicate region of memory
117 : *
118 : * @src: memory region to duplicate
119 : * @len: memory region length
120 : * @gfp: GFP mask to use
121 : *
122 : * Return: newly allocated copy of @src or %NULL in case of error
123 : */
124 1 : void *kmemdup(const void *src, size_t len, gfp_t gfp)
125 : {
126 : void *p;
127 :
128 1 : p = kmalloc_track_caller(len, gfp);
129 1 : if (p)
130 1 : memcpy(p, src, len);
131 1 : return p;
132 : }
133 : EXPORT_SYMBOL(kmemdup);
134 :
135 : /**
136 : * kmemdup_nul - Create a NUL-terminated string from unterminated data
137 : * @s: The data to stringify
138 : * @len: The size of the data
139 : * @gfp: the GFP mask used in the kmalloc() call when allocating memory
140 : *
141 : * Return: newly allocated copy of @s with NUL-termination or %NULL in
142 : * case of error
143 : */
144 10 : char *kmemdup_nul(const char *s, size_t len, gfp_t gfp)
145 : {
146 : char *buf;
147 :
148 10 : if (!s)
149 : return NULL;
150 :
151 10 : buf = kmalloc_track_caller(len + 1, gfp);
152 10 : if (buf) {
153 10 : memcpy(buf, s, len);
154 10 : buf[len] = '\0';
155 : }
156 : return buf;
157 : }
158 : EXPORT_SYMBOL(kmemdup_nul);
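/*
 * Editor's note: an illustrative sketch, not part of mm/util.c. kmemdup_nul()
 * suits data whose exact length is known but which is not NUL-terminated,
 * e.g. the "key" part of a "key=value" option string. The function name is
 * hypothetical.
 */
static char *demo_dup_key(const char *option)
{
	const char *eq = strchr(option, '=');

	if (!eq)
		return NULL;
	/* Copy only the bytes before '=' and append a terminating '\0'. */
	return kmemdup_nul(option, eq - option, GFP_KERNEL);
}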
159 :
160 : /**
161 : * memdup_user - duplicate memory region from user space
162 : *
163 : * @src: source address in user space
164 : * @len: number of bytes to copy
165 : *
166 : * Return: an ERR_PTR() on failure. Result is physically
167 : * contiguous, to be freed by kfree().
168 : */
169 0 : void *memdup_user(const void __user *src, size_t len)
170 : {
171 : void *p;
172 :
173 0 : p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN);
174 0 : if (!p)
175 : return ERR_PTR(-ENOMEM);
176 :
177 0 : if (copy_from_user(p, src, len)) {
178 0 : kfree(p);
179 0 : return ERR_PTR(-EFAULT);
180 : }
181 :
182 : return p;
183 : }
184 : EXPORT_SYMBOL(memdup_user);
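/*
 * Editor's note: an illustrative sketch, not part of mm/util.c. memdup_user()
 * reports failure through ERR_PTR() rather than NULL, so callers must check
 * with IS_ERR()/PTR_ERR(). The function name is hypothetical.
 */
static int demo_handle_uarg(const void __user *uarg, size_t len)
{
	void *kbuf = memdup_user(uarg, len);

	if (IS_ERR(kbuf))
		return PTR_ERR(kbuf);	/* -ENOMEM or -EFAULT */

	/* ... work on the physically contiguous kernel copy ... */

	kfree(kbuf);
	return 0;
}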
185 :
186 : /**
187 : * vmemdup_user - duplicate memory region from user space
188 : *
189 : * @src: source address in user space
190 : * @len: number of bytes to copy
191 : *
192 : * Return: an ERR_PTR() on failure. Result may not be
193 : * physically contiguous. Use kvfree() to free.
194 : */
195 0 : void *vmemdup_user(const void __user *src, size_t len)
196 : {
197 : void *p;
198 :
199 0 : p = kvmalloc(len, GFP_USER);
200 0 : if (!p)
201 : return ERR_PTR(-ENOMEM);
202 :
203 0 : if (copy_from_user(p, src, len)) {
204 0 : kvfree(p);
205 0 : return ERR_PTR(-EFAULT);
206 : }
207 :
208 : return p;
209 : }
210 : EXPORT_SYMBOL(vmemdup_user);
211 :
212 : /**
213 : * strndup_user - duplicate an existing string from user space
214 : * @s: The string to duplicate
215 : * @n: Maximum number of bytes to copy, including the trailing NUL.
216 : *
217 : * Return: newly allocated copy of @s or an ERR_PTR() in case of error
218 : */
219 0 : char *strndup_user(const char __user *s, long n)
220 : {
221 : char *p;
222 : long length;
223 :
224 0 : length = strnlen_user(s, n);
225 :
226 0 : if (!length)
227 : return ERR_PTR(-EFAULT);
228 :
229 0 : if (length > n)
230 : return ERR_PTR(-EINVAL);
231 :
232 0 : p = memdup_user(s, length);
233 :
234 0 : if (IS_ERR(p))
235 : return p;
236 :
237 0 : p[length - 1] = '\0';
238 :
239 0 : return p;
240 : }
241 : EXPORT_SYMBOL(strndup_user);
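/*
 * Editor's note: an illustrative sketch, not part of mm/util.c. strndup_user()
 * bounds the copy (including the trailing NUL) and, like memdup_user(),
 * signals errors via ERR_PTR(). The function name and the PAGE_SIZE limit
 * are hypothetical choices.
 */
static int demo_set_label(const char __user *ulabel)
{
	char *label = strndup_user(ulabel, PAGE_SIZE);

	if (IS_ERR(label))
		return PTR_ERR(label);	/* -EFAULT, -EINVAL or -ENOMEM */

	pr_debug("new label: %s\n", label);
	kfree(label);
	return 0;
}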
242 :
243 : /**
244 : * memdup_user_nul - duplicate memory region from user space and NUL-terminate
245 : *
246 : * @src: source address in user space
247 : * @len: number of bytes to copy
248 : *
249 : * Return: an ERR_PTR() on failure.
250 : */
251 0 : void *memdup_user_nul(const void __user *src, size_t len)
252 : {
253 : char *p;
254 :
255 : /*
256 : * Always use GFP_KERNEL, since copy_from_user() can sleep and
257 : * cause a page fault, which makes it pointless to use GFP_NOFS
258 : * or GFP_ATOMIC.
259 : */
260 0 : p = kmalloc_track_caller(len + 1, GFP_KERNEL);
261 0 : if (!p)
262 : return ERR_PTR(-ENOMEM);
263 :
264 0 : if (copy_from_user(p, src, len)) {
265 0 : kfree(p);
266 0 : return ERR_PTR(-EFAULT);
267 : }
268 0 : p[len] = '\0';
269 :
270 0 : return p;
271 : }
272 : EXPORT_SYMBOL(memdup_user_nul);
273 :
274 0 : void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
275 : struct vm_area_struct *prev)
276 : {
277 : struct vm_area_struct *next;
278 :
279 0 : vma->vm_prev = prev;
280 0 : if (prev) {
281 0 : next = prev->vm_next;
282 0 : prev->vm_next = vma;
283 : } else {
284 0 : next = mm->mmap;
285 0 : mm->mmap = vma;
286 : }
287 0 : vma->vm_next = next;
288 0 : if (next)
289 0 : next->vm_prev = vma;
290 0 : }
291 :
292 0 : void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma)
293 : {
294 : struct vm_area_struct *prev, *next;
295 :
296 0 : next = vma->vm_next;
297 0 : prev = vma->vm_prev;
298 0 : if (prev)
299 0 : prev->vm_next = next;
300 : else
301 0 : mm->mmap = next;
302 0 : if (next)
303 0 : next->vm_prev = prev;
304 0 : }
305 :
306 : /* Check if the vma is being used as a stack by this task */
307 0 : int vma_is_stack_for_current(struct vm_area_struct *vma)
308 : {
309 0 : struct task_struct * __maybe_unused t = current;
310 :
311 0 : return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
312 : }
313 :
314 : /*
315 : * Change backing file, only valid to use during initial VMA setup.
316 : * Change the backing file; only valid to use during initial VMA setup.
317 0 : void vma_set_file(struct vm_area_struct *vma, struct file *file)
318 : {
319 : /* Changing an anonymous vma with this is illegal */
320 0 : get_file(file);
321 0 : swap(vma->vm_file, file);
322 0 : fput(file);
323 0 : }
324 : EXPORT_SYMBOL(vma_set_file);
325 :
326 : #ifndef STACK_RND_MASK
327 : #define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */
328 : #endif
329 :
330 0 : unsigned long randomize_stack_top(unsigned long stack_top)
331 : {
332 0 : unsigned long random_variable = 0;
333 :
334 0 : if (current->flags & PF_RANDOMIZE) {
335 0 : random_variable = get_random_long();
336 0 : random_variable &= STACK_RND_MASK;
337 0 : random_variable <<= PAGE_SHIFT;
338 : }
339 : #ifdef CONFIG_STACK_GROWSUP
340 : return PAGE_ALIGN(stack_top) + random_variable;
341 : #else
342 0 : return PAGE_ALIGN(stack_top) - random_variable;
343 : #endif
344 : }
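/*
 * Editor's note: a worked example, not part of mm/util.c. With 4 KiB pages
 * (PAGE_SHIFT == 12) the default STACK_RND_MASK is 0x7ff, so the random
 * offset above spans 0..0x7ff pages, i.e. up to 0x7ff000 bytes (just under
 * 8 MiB) subtracted from (or, with CONFIG_STACK_GROWSUP, added to) the
 * page-aligned stack top.
 */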
345 :
346 : #ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
347 : unsigned long arch_randomize_brk(struct mm_struct *mm)
348 : {
349 : /* Is the current task 32-bit? */
350 : if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task())
351 : return randomize_page(mm->brk, SZ_32M);
352 :
353 : return randomize_page(mm->brk, SZ_1G);
354 : }
355 :
356 : unsigned long arch_mmap_rnd(void)
357 : {
358 : unsigned long rnd;
359 :
360 : #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
361 : if (is_compat_task())
362 : rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
363 : else
364 : #endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */
365 : rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
366 :
367 : return rnd << PAGE_SHIFT;
368 : }
369 :
370 : static int mmap_is_legacy(struct rlimit *rlim_stack)
371 : {
372 : if (current->personality & ADDR_COMPAT_LAYOUT)
373 : return 1;
374 :
375 : if (rlim_stack->rlim_cur == RLIM_INFINITY)
376 : return 1;
377 :
378 : return sysctl_legacy_va_layout;
379 : }
380 :
381 : /*
382 : * Leave enough space between the mmap area and the stack to honour ulimit in
383 : * the face of randomisation.
384 : */
385 : #define MIN_GAP (SZ_128M)
386 : #define MAX_GAP (STACK_TOP / 6 * 5)
387 :
388 : static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
389 : {
390 : unsigned long gap = rlim_stack->rlim_cur;
391 : unsigned long pad = stack_guard_gap;
392 :
393 : /* Account for stack randomization if necessary */
394 : if (current->flags & PF_RANDOMIZE)
395 : pad += (STACK_RND_MASK << PAGE_SHIFT);
396 :
397 : /* Values close to RLIM_INFINITY can overflow. */
398 : if (gap + pad > gap)
399 : gap += pad;
400 :
401 : if (gap < MIN_GAP)
402 : gap = MIN_GAP;
403 : else if (gap > MAX_GAP)
404 : gap = MAX_GAP;
405 :
406 : return PAGE_ALIGN(STACK_TOP - gap - rnd);
407 : }
408 :
409 : void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
410 : {
411 : unsigned long random_factor = 0UL;
412 :
413 : if (current->flags & PF_RANDOMIZE)
414 : random_factor = arch_mmap_rnd();
415 :
416 : if (mmap_is_legacy(rlim_stack)) {
417 : mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
418 : mm->get_unmapped_area = arch_get_unmapped_area;
419 : } else {
420 : mm->mmap_base = mmap_base(random_factor, rlim_stack);
421 : mm->get_unmapped_area = arch_get_unmapped_area_topdown;
422 : }
423 : }
424 : #elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
425 0 : void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
426 : {
427 0 : mm->mmap_base = TASK_UNMAPPED_BASE;
428 0 : mm->get_unmapped_area = arch_get_unmapped_area;
429 0 : }
430 : #endif
431 :
432 : /**
433 : * __account_locked_vm - account locked pages to an mm's locked_vm
434 : * @mm: mm to account against
435 : * @pages: number of pages to account
436 : * @inc: %true if @pages should be considered positive, %false if not
437 : * @task: task used to check RLIMIT_MEMLOCK
438 : * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
439 : *
440 : * Assumes @task and @mm are valid (i.e. at least one reference on each), and
441 : * that mmap_lock is held as writer.
442 : *
443 : * Return:
444 : * * 0 on success
445 : * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
446 : */
447 0 : int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
448 : struct task_struct *task, bool bypass_rlim)
449 : {
450 : unsigned long locked_vm, limit;
451 0 : int ret = 0;
452 :
453 0 : mmap_assert_write_locked(mm);
454 :
455 0 : locked_vm = mm->locked_vm;
456 0 : if (inc) {
457 0 : if (!bypass_rlim) {
458 0 : limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
459 0 : if (locked_vm + pages > limit)
460 0 : ret = -ENOMEM;
461 : }
462 0 : if (!ret)
463 0 : mm->locked_vm = locked_vm + pages;
464 : } else {
465 0 : WARN_ON_ONCE(pages > locked_vm);
466 0 : mm->locked_vm = locked_vm - pages;
467 : }
468 :
469 : pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
470 : (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
471 : locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
472 : ret ? " - exceeded" : "");
473 :
474 0 : return ret;
475 : }
476 : EXPORT_SYMBOL_GPL(__account_locked_vm);
477 :
478 : /**
479 : * account_locked_vm - account locked pages to an mm's locked_vm
480 : * @mm: mm to account against, may be NULL
481 : * @pages: number of pages to account
482 : * @inc: %true if @pages should be considered positive, %false if not
483 : *
484 : * Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
485 : *
486 : * Return:
487 : * * 0 on success, or if mm is NULL
488 : * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
489 : */
490 0 : int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
491 : {
492 : int ret;
493 :
494 0 : if (pages == 0 || !mm)
495 : return 0;
496 :
497 0 : mmap_write_lock(mm);
498 0 : ret = __account_locked_vm(mm, pages, inc, current,
499 0 : capable(CAP_IPC_LOCK));
500 0 : mmap_write_unlock(mm);
501 :
502 0 : return ret;
503 : }
504 : EXPORT_SYMBOL_GPL(account_locked_vm);
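/*
 * Editor's note: an illustrative sketch, not part of mm/util.c. A driver that
 * long-term pins user pages typically charges them against RLIMIT_MEMLOCK
 * with account_locked_vm() and reverses the charge with inc == false on
 * teardown. The function names are hypothetical.
 */
static int demo_charge_pinned(struct mm_struct *mm, unsigned long npages)
{
	/* May return -ENOMEM if RLIMIT_MEMLOCK would be exceeded. */
	return account_locked_vm(mm, npages, true);
}

static void demo_uncharge_pinned(struct mm_struct *mm, unsigned long npages)
{
	account_locked_vm(mm, npages, false);
}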
505 :
506 0 : unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
507 : unsigned long len, unsigned long prot,
508 : unsigned long flag, unsigned long pgoff)
509 : {
510 : unsigned long ret;
511 0 : struct mm_struct *mm = current->mm;
512 : unsigned long populate;
513 0 : LIST_HEAD(uf);
514 :
515 0 : ret = security_mmap_file(file, prot, flag);
516 : if (!ret) {
517 0 : if (mmap_write_lock_killable(mm))
518 : return -EINTR;
519 0 : ret = do_mmap(file, addr, len, prot, flag, pgoff, &populate,
520 : &uf);
521 0 : mmap_write_unlock(mm);
522 0 : userfaultfd_unmap_complete(mm, &uf);
523 0 : if (populate)
524 0 : mm_populate(ret, populate);
525 : }
526 : return ret;
527 : }
528 :
529 0 : unsigned long vm_mmap(struct file *file, unsigned long addr,
530 : unsigned long len, unsigned long prot,
531 : unsigned long flag, unsigned long offset)
532 : {
533 0 : if (unlikely(offset + PAGE_ALIGN(len) < offset))
534 : return -EINVAL;
535 0 : if (unlikely(offset_in_page(offset)))
536 : return -EINVAL;
537 :
538 0 : return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
539 : }
540 : EXPORT_SYMBOL(vm_mmap);
541 :
542 : /**
543 : * kvmalloc_node - attempt to allocate physically contiguous memory, but upon
544 : * failure, fall back to non-contiguous (vmalloc) allocation.
545 : * @size: size of the request.
546 : * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
547 : * @node: numa node to allocate from
548 : *
549 : * Uses kmalloc to get the memory, but if the allocation fails it falls back
550 : * to the vmalloc allocator. Use kvfree() for freeing the memory.
551 : *
552 : * GFP_NOWAIT and GFP_ATOMIC are not supported, nor is the __GFP_NORETRY modifier.
553 : * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
554 : * preferable to the vmalloc fallback, due to visible performance drawbacks.
555 : *
556 : * Return: pointer to the allocated memory or %NULL in case of failure
557 : */
558 0 : void *kvmalloc_node(size_t size, gfp_t flags, int node)
559 : {
560 0 : gfp_t kmalloc_flags = flags;
561 : void *ret;
562 :
563 : /*
564 : * We want to attempt a large physically contiguous block first because
565 : * it is less likely to fragment multiple larger blocks and therefore
566 : * contributes less to long-term fragmentation than the vmalloc fallback.
567 : * However, make sure that larger requests are not too disruptive - no
568 : * OOM killer and no allocation failure warnings, as we have a fallback.
569 : */
570 0 : if (size > PAGE_SIZE) {
571 0 : kmalloc_flags |= __GFP_NOWARN;
572 :
573 0 : if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL))
574 0 : kmalloc_flags |= __GFP_NORETRY;
575 :
576 : /* nofail semantics are implemented by the vmalloc fallback */
577 0 : kmalloc_flags &= ~__GFP_NOFAIL;
578 : }
579 :
580 0 : ret = kmalloc_node(size, kmalloc_flags, node);
581 :
582 : /*
583 : * It doesn't really make sense to fall back to vmalloc for sub-page
584 : * requests.
585 : */
586 0 : if (ret || size <= PAGE_SIZE)
587 : return ret;
588 :
589 : /* Don't even allow crazy sizes */
590 0 : if (unlikely(size > INT_MAX)) {
591 0 : WARN_ON_ONCE(!(flags & __GFP_NOWARN));
592 : return NULL;
593 : }
594 :
595 : /*
596 : * kvmalloc() can always use VM_ALLOW_HUGE_VMAP,
597 : * since the callers already cannot assume anything
598 : * about the resulting pointer, and cannot play
599 : * protection games.
600 : */
601 0 : return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
602 0 : flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
603 0 : node, __builtin_return_address(0));
604 : }
605 : EXPORT_SYMBOL(kvmalloc_node);
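/*
 * Editor's note: an illustrative sketch, not part of mm/util.c. kvmalloc()
 * and kvmalloc_array() are thin wrappers around kvmalloc_node(); the result
 * may come from vmalloc and so may not be physically contiguous, and it must
 * be freed with kvfree(). The function names are hypothetical.
 */
static u32 *demo_alloc_table(size_t nr_entries)
{
	return kvmalloc_array(nr_entries, sizeof(u32), GFP_KERNEL | __GFP_ZERO);
}

static void demo_free_table(u32 *table)
{
	kvfree(table);
}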
606 :
607 : /**
608 : * kvfree() - Free memory.
609 : * @addr: Pointer to allocated memory.
610 : *
611 : * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc().
612 : * It is slightly more efficient to use kfree() or vfree() directly if you
613 : * know which one to use.
614 : *
615 : * Context: Either preemptible task context or not-NMI interrupt.
616 : */
617 0 : void kvfree(const void *addr)
618 : {
619 0 : if (is_vmalloc_addr(addr))
620 0 : vfree(addr);
621 : else
622 0 : kfree(addr);
623 0 : }
624 : EXPORT_SYMBOL(kvfree);
625 :
626 : /**
627 : * kvfree_sensitive - Free a data object containing sensitive information.
628 : * @addr: address of the data object to be freed.
629 : * @len: length of the data object.
630 : *
631 : * Use the special memzero_explicit() function to clear the content of a
632 : * kvmalloc'ed object containing sensitive data to make sure that the
633 : * compiler won't optimize out the data clearing.
634 : */
635 0 : void kvfree_sensitive(const void *addr, size_t len)
636 : {
637 0 : if (likely(!ZERO_OR_NULL_PTR(addr))) {
638 0 : memzero_explicit((void *)addr, len);
639 0 : kvfree(addr);
640 : }
641 0 : }
642 : EXPORT_SYMBOL(kvfree_sensitive);
643 :
644 0 : void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
645 : {
646 : void *newp;
647 :
648 0 : if (oldsize >= newsize)
649 : return (void *)p;
650 0 : newp = kvmalloc(newsize, flags);
651 0 : if (!newp)
652 : return NULL;
653 0 : memcpy(newp, p, oldsize);
654 0 : kvfree(p);
655 0 : return newp;
656 : }
657 : EXPORT_SYMBOL(kvrealloc);
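/*
 * Editor's note: an illustrative sketch, not part of mm/util.c. Unlike
 * krealloc(), kvrealloc() takes the old size, and on failure it returns
 * NULL while leaving the old buffer intact, so assign through a temporary.
 * The function name is hypothetical.
 */
static int demo_grow_buffer(void **bufp, size_t oldsize, size_t newsize)
{
	void *tmp = kvrealloc(*bufp, oldsize, newsize, GFP_KERNEL);

	if (!tmp)
		return -ENOMEM;	/* *bufp is still valid and unchanged */
	*bufp = tmp;
	return 0;
}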
658 :
659 : /**
660 : * __vmalloc_array - allocate memory for a virtually contiguous array.
661 : * @n: number of elements.
662 : * @size: element size.
663 : * @flags: the type of memory to allocate (see kmalloc).
664 : */
665 0 : void *__vmalloc_array(size_t n, size_t size, gfp_t flags)
666 : {
667 : size_t bytes;
668 :
669 0 : if (unlikely(check_mul_overflow(n, size, &bytes)))
670 : return NULL;
671 0 : return __vmalloc(bytes, flags);
672 : }
673 : EXPORT_SYMBOL(__vmalloc_array);
674 :
675 : /**
676 : * vmalloc_array - allocate memory for a virtually contiguous array.
677 : * @n: number of elements.
678 : * @size: element size.
679 : */
680 0 : void *vmalloc_array(size_t n, size_t size)
681 : {
682 0 : return __vmalloc_array(n, size, GFP_KERNEL);
683 : }
684 : EXPORT_SYMBOL(vmalloc_array);
685 :
686 : /**
687 : * __vcalloc - allocate and zero memory for a virtually contiguous array.
688 : * @n: number of elements.
689 : * @size: element size.
690 : * @flags: the type of memory to allocate (see kmalloc).
691 : */
692 0 : void *__vcalloc(size_t n, size_t size, gfp_t flags)
693 : {
694 0 : return __vmalloc_array(n, size, flags | __GFP_ZERO);
695 : }
696 : EXPORT_SYMBOL(__vcalloc);
697 :
698 : /**
699 : * vcalloc - allocate and zero memory for a virtually contiguous array.
700 : * @n: number of elements.
701 : * @size: element size.
702 : */
703 0 : void *vcalloc(size_t n, size_t size)
704 : {
705 0 : return __vmalloc_array(n, size, GFP_KERNEL | __GFP_ZERO);
706 : }
707 : EXPORT_SYMBOL(vcalloc);
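/*
 * Editor's note: an illustrative sketch, not part of mm/util.c. vcalloc()
 * multiplies with overflow checking and zeroes the vmalloc'ed result, making
 * it the vmalloc counterpart of kcalloc(); free the memory with vfree().
 * The function name is hypothetical.
 */
static unsigned long *demo_alloc_counters(size_t nr_counters)
{
	return vcalloc(nr_counters, sizeof(unsigned long));
}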
708 :
709 : /* Neutral page->mapping pointer to address_space or anon_vma or other */
710 0 : void *page_rmapping(struct page *page)
711 : {
712 0 : return folio_raw_mapping(page_folio(page));
713 : }
714 :
715 : /**
716 : * folio_mapped - Is this folio mapped into userspace?
717 : * @folio: The folio.
718 : *
719 : * Return: True if any page in this folio is referenced by user page tables.
720 : */
721 0 : bool folio_mapped(struct folio *folio)
722 : {
723 : long i, nr;
724 :
725 0 : if (!folio_test_large(folio))
726 0 : return atomic_read(&folio->_mapcount) >= 0;
727 0 : if (atomic_read(folio_mapcount_ptr(folio)) >= 0)
728 : return true;
729 0 : if (folio_test_hugetlb(folio))
730 : return false;
731 :
732 0 : nr = folio_nr_pages(folio);
733 0 : for (i = 0; i < nr; i++) {
734 0 : if (atomic_read(&folio_page(folio, i)->_mapcount) >= 0)
735 : return true;
736 : }
737 : return false;
738 : }
739 : EXPORT_SYMBOL(folio_mapped);
740 :
741 0 : struct anon_vma *folio_anon_vma(struct folio *folio)
742 : {
743 0 : unsigned long mapping = (unsigned long)folio->mapping;
744 :
745 0 : if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
746 : return NULL;
747 0 : return (void *)(mapping - PAGE_MAPPING_ANON);
748 : }
749 :
750 : /**
751 : * folio_mapping - Find the mapping where this folio is stored.
752 : * @folio: The folio.
753 : *
754 : * For folios which are in the page cache, return the mapping that this
755 : * page belongs to. Folios in the swap cache return the swap mapping
756 : * this page is stored in (which is different from the mapping for the
757 : * swap file or swap device where the data is stored).
758 : *
759 : * You can call this for folios which aren't in the swap cache or page
760 : * cache and it will return NULL.
761 : */
762 0 : struct address_space *folio_mapping(struct folio *folio)
763 : {
764 : struct address_space *mapping;
765 :
766 : /* This happens if someone calls flush_dcache_page on a slab page */
767 0 : if (unlikely(folio_test_slab(folio)))
768 : return NULL;
769 :
770 0 : if (unlikely(folio_test_swapcache(folio)))
771 0 : return swap_address_space(folio_swap_entry(folio));
772 :
773 0 : mapping = folio->mapping;
774 0 : if ((unsigned long)mapping & PAGE_MAPPING_ANON)
775 : return NULL;
776 :
777 0 : return (void *)((unsigned long)mapping & ~PAGE_MAPPING_FLAGS);
778 : }
779 : EXPORT_SYMBOL(folio_mapping);
780 :
781 : /* Slow path of page_mapcount() for compound pages */
782 0 : int __page_mapcount(struct page *page)
783 : {
784 : int ret;
785 :
786 0 : ret = atomic_read(&page->_mapcount) + 1;
787 : /*
788 : * For file THP, page->_mapcount contains the total number of mappings
789 : * of the page: no need to look into compound_mapcount.
790 : */
791 0 : if (!PageAnon(page) && !PageHuge(page))
792 : return ret;
793 0 : page = compound_head(page);
794 0 : ret += atomic_read(compound_mapcount_ptr(page)) + 1;
795 0 : if (PageDoubleMap(page))
796 : ret--;
797 0 : return ret;
798 : }
799 : EXPORT_SYMBOL_GPL(__page_mapcount);
800 :
801 : /**
802 : * folio_mapcount() - Calculate the number of mappings of this folio.
803 : * @folio: The folio.
804 : *
805 : * A large folio tracks both how many times the entire folio is mapped,
806 : * and how many times each individual page in the folio is mapped.
807 : * This function calculates the total number of times the folio is
808 : * mapped.
809 : *
810 : * Return: The number of times this folio is mapped.
811 : */
812 0 : int folio_mapcount(struct folio *folio)
813 : {
814 : int i, compound, nr, ret;
815 :
816 0 : if (likely(!folio_test_large(folio)))
817 0 : return atomic_read(&folio->_mapcount) + 1;
818 :
819 0 : compound = folio_entire_mapcount(folio);
820 0 : nr = folio_nr_pages(folio);
821 0 : if (folio_test_hugetlb(folio))
822 : return compound;
823 0 : ret = compound;
824 0 : for (i = 0; i < nr; i++)
825 0 : ret += atomic_read(&folio_page(folio, i)->_mapcount) + 1;
826 : /* File pages have compound_mapcount included in _mapcount */
827 0 : if (!folio_test_anon(folio))
828 0 : return ret - compound * nr;
829 : if (folio_test_double_map(folio))
830 : ret -= nr;
831 : return ret;
832 : }
833 :
834 : /**
835 : * folio_copy - Copy the contents of one folio to another.
836 : * @dst: Folio to copy to.
837 : * @src: Folio to copy from.
838 : *
839 : * The bytes in the folio represented by @src are copied to @dst.
840 : * Assumes the caller has validated that @dst is at least as large as @src.
841 : * Can be called in atomic context for order-0 folios, but if the folio is
842 : * larger, it may sleep.
843 : */
844 0 : void folio_copy(struct folio *dst, struct folio *src)
845 : {
846 0 : long i = 0;
847 0 : long nr = folio_nr_pages(src);
848 :
849 : for (;;) {
850 0 : copy_highpage(folio_page(dst, i), folio_page(src, i));
851 0 : if (++i == nr)
852 : break;
853 0 : cond_resched();
854 : }
855 0 : }
856 :
857 : int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
858 : int sysctl_overcommit_ratio __read_mostly = 50;
859 : unsigned long sysctl_overcommit_kbytes __read_mostly;
860 : int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
861 : unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
862 : unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
863 :
864 0 : int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer,
865 : size_t *lenp, loff_t *ppos)
866 : {
867 : int ret;
868 :
869 0 : ret = proc_dointvec(table, write, buffer, lenp, ppos);
870 0 : if (ret == 0 && write)
871 0 : sysctl_overcommit_kbytes = 0;
872 0 : return ret;
873 : }
874 :
875 0 : static void sync_overcommit_as(struct work_struct *dummy)
876 : {
877 0 : percpu_counter_sync(&vm_committed_as);
878 0 : }
879 :
880 0 : int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer,
881 : size_t *lenp, loff_t *ppos)
882 : {
883 : struct ctl_table t;
884 0 : int new_policy = -1;
885 : int ret;
886 :
887 : /*
888 : * The deviation of sync_overcommit_as could be big with loose policies
889 : * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing the policy to
890 : * the strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply
891 : * with the strict "NEVER", and to avoid a possible race condition (even
892 : * though users usually won't switch to OVERCOMMIT_NEVER very often),
893 : * the switch is done in the following order:
894 : * 1. changing the batch
895 : * 2. sync percpu count on each CPU
896 : * 3. switch the policy
897 : */
898 0 : if (write) {
899 0 : t = *table;
900 0 : t.data = &new_policy;
901 0 : ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
902 0 : if (ret || new_policy == -1)
903 : return ret;
904 :
905 0 : mm_compute_batch(new_policy);
906 0 : if (new_policy == OVERCOMMIT_NEVER)
907 0 : schedule_on_each_cpu(sync_overcommit_as);
908 0 : sysctl_overcommit_memory = new_policy;
909 : } else {
910 0 : ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
911 : }
912 :
913 : return ret;
914 : }
915 :
916 0 : int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer,
917 : size_t *lenp, loff_t *ppos)
918 : {
919 : int ret;
920 :
921 0 : ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
922 0 : if (ret == 0 && write)
923 0 : sysctl_overcommit_ratio = 0;
924 0 : return ret;
925 : }
926 :
927 : /*
928 : * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
929 : */
930 0 : unsigned long vm_commit_limit(void)
931 : {
932 : unsigned long allowed;
933 :
934 0 : if (sysctl_overcommit_kbytes)
935 0 : allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
936 : else
937 0 : allowed = ((totalram_pages() - hugetlb_total_pages())
938 0 : * sysctl_overcommit_ratio / 100);
939 0 : allowed += total_swap_pages;
940 :
941 0 : return allowed;
942 : }
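/*
 * Editor's note: a worked example, not part of mm/util.c. With 4 KiB pages,
 * 8 GiB of RAM (2097152 pages), no hugetlb pages, 2 GiB of swap (524288
 * pages), overcommit_kbytes == 0 and the default overcommit_ratio of 50,
 * the commit limit is:
 *
 *   allowed = 2097152 * 50 / 100 + 524288 = 1572864 pages = 6 GiB
 */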
943 :
944 : /*
945 : * Make sure vm_committed_as is in its own cacheline and is not shared with
946 : * other variables, as it can be updated frequently by several CPUs.
947 : */
948 : struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
949 :
950 : /*
951 : * The global memory commitment made in the system can be a metric
952 : * that can be used to drive ballooning decisions when Linux is hosted
953 : * as a guest. On Hyper-V, the host implements a policy engine for dynamically
954 : * balancing memory across competing virtual machines that are hosted.
955 : * Several metrics drive this policy engine including the guest reported
956 : * memory commitment.
957 : *
958 : * The time cost of this is very low for small platforms. Even for a big
959 : * platform like a 2S/36C/72T Skylake server, in the worst case, where
960 : * vm_committed_as's spinlock is under severe contention, the time cost
961 : * could be about 30~40 microseconds.
962 : */
963 0 : unsigned long vm_memory_committed(void)
964 : {
965 0 : return percpu_counter_sum_positive(&vm_committed_as);
966 : }
967 : EXPORT_SYMBOL_GPL(vm_memory_committed);
968 :
969 : /*
970 : * Check that a process has enough memory to allocate a new virtual
971 : * mapping. 0 means there is enough memory for the allocation to
972 : * succeed and -ENOMEM implies there is not.
973 : *
974 : * We currently support three overcommit policies, which are set via the
975 : * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting.rst
976 : *
977 : * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
978 : * Additional code 2002 Jul 20 by Robert Love.
979 : *
980 : * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
981 : *
982 : * Note this is a helper function intended to be used by LSMs which
983 : * wish to use this logic.
984 : */
985 0 : int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
986 : {
987 : long allowed;
988 :
989 0 : vm_acct_memory(pages);
990 :
991 : /*
992 : * Sometimes we want to use more memory than we have
993 : */
994 0 : if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
995 : return 0;
996 :
997 0 : if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
998 0 : if (pages > totalram_pages() + total_swap_pages)
999 : goto error;
1000 : return 0;
1001 : }
1002 :
1003 0 : allowed = vm_commit_limit();
1004 : /*
1005 : * Reserve some for root
1006 : */
1007 0 : if (!cap_sys_admin)
1008 0 : allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
1009 :
1010 : /*
1011 : * Don't let a single process grow so big a user can't recover
1012 : */
1013 0 : if (mm) {
1014 0 : long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
1015 :
1016 0 : allowed -= min_t(long, mm->total_vm / 32, reserve);
1017 : }
1018 :
1019 0 : if (percpu_counter_read_positive(&vm_committed_as) < allowed)
1020 : return 0;
1021 : error:
1022 0 : vm_unacct_memory(pages);
1023 :
1024 0 : return -ENOMEM;
1025 : }
1026 :
1027 : /**
1028 : * get_cmdline() - copy the cmdline value to a buffer.
1029 : * @task: the task whose cmdline value to copy.
1030 : * @buffer: the buffer to copy to.
1031 : * @buflen: the length of the buffer. Larger cmdline values are truncated
1032 : * to this length.
1033 : *
1034 : * Return: the size of the cmdline field copied. Note that the copy does
1035 : * not guarantee a trailing NUL byte.
1036 : */
1037 0 : int get_cmdline(struct task_struct *task, char *buffer, int buflen)
1038 : {
1039 0 : int res = 0;
1040 : unsigned int len;
1041 0 : struct mm_struct *mm = get_task_mm(task);
1042 : unsigned long arg_start, arg_end, env_start, env_end;
1043 0 : if (!mm)
1044 : goto out;
1045 0 : if (!mm->arg_end)
1046 : goto out_mm; /* Shh! No looking before we're done */
1047 :
1048 0 : spin_lock(&mm->arg_lock);
1049 0 : arg_start = mm->arg_start;
1050 0 : arg_end = mm->arg_end;
1051 0 : env_start = mm->env_start;
1052 0 : env_end = mm->env_end;
1053 0 : spin_unlock(&mm->arg_lock);
1054 :
1055 0 : len = arg_end - arg_start;
1056 :
1057 0 : if (len > buflen)
1058 0 : len = buflen;
1059 :
1060 0 : res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE);
1061 :
1062 : /*
1063 : * If the nul at the end of args has been overwritten, then
1064 : * assume the application is using setproctitle(3).
1065 : */
1066 0 : if (res > 0 && buffer[res-1] != '\0' && len < buflen) {
1067 0 : len = strnlen(buffer, res);
1068 0 : if (len < res) {
1069 0 : res = len;
1070 : } else {
1071 0 : len = env_end - env_start;
1072 0 : if (len > buflen - res)
1073 0 : len = buflen - res;
1074 0 : res += access_process_vm(task, env_start,
1075 0 : buffer+res, len,
1076 : FOLL_FORCE);
1077 0 : res = strnlen(buffer, res);
1078 : }
1079 : }
1080 : out_mm:
1081 0 : mmput(mm);
1082 : out:
1083 0 : return res;
1084 : }
1085 :
1086 0 : int __weak memcmp_pages(struct page *page1, struct page *page2)
1087 : {
1088 : char *addr1, *addr2;
1089 : int ret;
1090 :
1091 0 : addr1 = kmap_atomic(page1);
1092 0 : addr2 = kmap_atomic(page2);
1093 0 : ret = memcmp(addr1, addr2, PAGE_SIZE);
1094 0 : kunmap_atomic(addr2);
1095 0 : kunmap_atomic(addr1);
1096 0 : return ret;
1097 : }
1098 :
1099 : #ifdef CONFIG_PRINTK
1100 : /**
1101 : * mem_dump_obj - Print available provenance information
1102 : * @object: object for which to find provenance information.
1103 : *
1104 : * This function uses pr_cont(), so the caller is expected to have
1105 : * printed out whatever preamble is appropriate. The provenance information
1106 : * depends on the type of object and on how much debugging is enabled.
1107 : * For example, for a slab-cache object, the slab name is printed, and,
1108 : * if available, the return address and stack trace from the allocation
1109 : * and last free path of that object.
1110 : */
1111 0 : void mem_dump_obj(void *object)
1112 : {
1113 : const char *type;
1114 :
1115 0 : if (kmem_valid_obj(object)) {
1116 0 : kmem_dump_obj(object);
1117 0 : return;
1118 : }
1119 :
1120 0 : if (vmalloc_dump_obj(object))
1121 : return;
1122 :
1123 0 : if (virt_addr_valid(object))
1124 : type = "non-slab/vmalloc memory";
1125 0 : else if (object == NULL)
1126 : type = "NULL pointer";
1127 0 : else if (object == ZERO_SIZE_PTR)
1128 : type = "zero-size pointer";
1129 : else
1130 0 : type = "non-paged memory";
1131 :
1132 0 : pr_cont(" %s\n", type);
1133 : }
1134 : EXPORT_SYMBOL_GPL(mem_dump_obj);
1135 : #endif
1136 :
1137 : /*
1138 : * A driver might set a page logically offline -- PageOffline() -- and
1139 : * turn the page inaccessible in the hypervisor; after that, access to page
1140 : * content can be fatal.
1141 : *
1142 : * Some special PFN walkers -- e.g., /proc/kcore -- read the content of random
1143 : * pages after checking PageOffline(); however, these PFN walkers can race
1144 : * with drivers that set PageOffline().
1145 : *
1146 : * page_offline_freeze()/page_offline_thaw() allow a subsystem to
1147 : * synchronize with such drivers, ensuring that a page cannot be set
1148 : * PageOffline() while frozen.
1149 : *
1150 : * page_offline_begin()/page_offline_end() is used by drivers that care about
1151 : * such races when setting a page PageOffline().
1152 : */
1153 : static DECLARE_RWSEM(page_offline_rwsem);
1154 :
1155 0 : void page_offline_freeze(void)
1156 : {
1157 0 : down_read(&page_offline_rwsem);
1158 0 : }
1159 :
1160 0 : void page_offline_thaw(void)
1161 : {
1162 0 : up_read(&page_offline_rwsem);
1163 0 : }
1164 :
1165 0 : void page_offline_begin(void)
1166 : {
1167 0 : down_write(&page_offline_rwsem);
1168 0 : }
1169 : EXPORT_SYMBOL(page_offline_begin);
1170 :
1171 0 : void page_offline_end(void)
1172 : {
1173 0 : up_write(&page_offline_rwsem);
1174 0 : }
1175 : EXPORT_SYMBOL(page_offline_end);
1176 :
1177 : #ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO
1178 : void flush_dcache_folio(struct folio *folio)
1179 : {
1180 : long i, nr = folio_nr_pages(folio);
1181 :
1182 : for (i = 0; i < nr; i++)
1183 : flush_dcache_page(folio_page(folio, i));
1184 : }
1185 : EXPORT_SYMBOL(flush_dcache_folio);
1186 : #endif