vmalloc.c source code [linux/mm/vmalloc.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* Copyright (C) 1993 Linus Torvalds
4	* Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
5	* SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
6	* Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
7	* Numa awareness, Christoph Lameter, SGI, June 2005
8	* Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019
9	*/
10
11	#include <linux/vmalloc.h>
12	#include <linux/mm.h>
13	#include <linux/module.h>
14	#include <linux/highmem.h>
15	#include <linux/sched/signal.h>
16	#include <linux/slab.h>
17	#include <linux/spinlock.h>
18	#include <linux/interrupt.h>
19	#include <linux/proc_fs.h>
20	#include <linux/seq_file.h>
21	#include <linux/set_memory.h>
22	#include <linux/debugobjects.h>
23	#include <linux/kallsyms.h>
24	#include <linux/list.h>
25	#include <linux/notifier.h>
26	#include <linux/rbtree.h>
27	#include <linux/xarray.h>
28	#include <linux/io.h>
29	#include <linux/rcupdate.h>
30	#include <linux/pfn.h>
31	#include <linux/kmemleak.h>
32	#include <linux/atomic.h>
33	#include <linux/compiler.h>
34	#include <linux/memcontrol.h>
35	#include <linux/llist.h>
36	#include <linux/uio.h>
37	#include <linux/bitops.h>
38	#include <linux/rbtree_augmented.h>
39	#include <linux/overflow.h>
40	#include <linux/pgtable.h>
41	#include <linux/hugetlb.h>
42	#include <linux/sched/mm.h>
43	#include <asm/tlbflush.h>
44	#include <asm/shmparam.h>
45	#include <linux/page_owner.h>
46
47	#define CREATE_TRACE_POINTS
48	#include <trace/events/vmalloc.h>
49
50	#include "internal.h"
51	#include "pgalloc-track.h"
52
53	#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
54	static unsigned int __ro_after_init ioremap_max_page_shift = BITS_PER_LONG - `1`;
55
56	static int __init set_nohugeiomap(char *str)
57	{
58	ioremap_max_page_shift = PAGE_SHIFT;
59	return `0`;
60	}
61	early_param("nohugeiomap", set_nohugeiomap);
62	#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */
63	static const unsigned int ioremap_max_page_shift = PAGE_SHIFT;
64	#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
65
66	#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
67	static bool __ro_after_init vmap_allow_huge = true;
68
69	static int __init set_nohugevmalloc(char *str)
70	{
71	vmap_allow_huge = false;
72	return `0`;
73	}
74	early_param("nohugevmalloc", set_nohugevmalloc);
75	#else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
76	static const bool vmap_allow_huge = false;
77	#endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
78
79	bool is_vmalloc_addr(const void *x)
80	{
81	unsigned long addr = (unsigned long)kasan_reset_tag(addr: x);
82
83	return addr >= VMALLOC_START && addr < VMALLOC_END;
84	}
85	EXPORT_SYMBOL(is_vmalloc_addr);
86
87	struct vfree_deferred {
88	struct llist_head list;
89	struct work_struct wq;
90	};
91	static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);
92
/*** Page table manipulation functions ***/
94	static int vmap_pte_range(pmd_t pmd, unsigned* long addr, unsigned long end,
95	phys_addr_t phys_addr, pgprot_t prot,
96	unsigned int max_page_shift, pgtbl_mod_mask *mask)
97	{
98	pte_t *pte;
99	u64 pfn;
100	struct page *page;
101	unsigned long size = PAGE_SIZE;
102
103	pfn = phys_addr >> PAGE_SHIFT;
104	pte = pte_alloc_kernel_track(pmd, addr, mask);
105	if (!pte)
106	return -ENOMEM;
107
108	arch_enter_lazy_mmu_mode();
109
110	do {
111	if (unlikely(!pte_none(ptep_get(pte)))) {
112	if (pfn_valid(pfn)) {
113	page = pfn_to_page(pfn);
114	dump_page(page, reason: "remapping already mapped page");
115	}
116	BUG();
117	}
118
119	#ifdef CONFIG_HUGETLB_PAGE
120	size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift);
121	if (size != PAGE_SIZE) {
122	pte_t entry = pfn_pte(page_nr: pfn, pgprot: prot);
123
124	entry = arch_make_huge_pte(entry, ilog2(size), flags: `0`);
125	set_huge_pte_at(mm: &init_mm, addr, ptep: pte, pte: entry, sz: size);
126	pfn += PFN_DOWN(size);
127	continue;
128	}
129	#endif
130	set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
131	pfn++;
132	} while (pte += PFN_DOWN(size), addr += size, addr != end);
133
134	arch_leave_lazy_mmu_mode();
135	*mask \|= PGTBL_PTE_MODIFIED;
136	return `0`;
137	}
138
139	static int vmap_try_huge_pmd(pmd_t pmd, unsigned* long addr, unsigned long end,
140	phys_addr_t phys_addr, pgprot_t prot,
141	unsigned int max_page_shift)
142	{
143	if (max_page_shift < PMD_SHIFT)
144	return `0`;
145
146	if (!arch_vmap_pmd_supported(prot))
147	return `0`;
148
149	if ((end - addr) != PMD_SIZE)
150	return `0`;
151
152	if (!IS_ALIGNED(addr, PMD_SIZE))
153	return `0`;
154
155	if (!IS_ALIGNED(phys_addr, PMD_SIZE))
156	return `0`;
157
158	if (pmd_present(pmd: *pmd) && !pmd_free_pte_page(pmd, addr))
159	return `0`;
160
161	return pmd_set_huge(pmd, addr: phys_addr, prot);
162	}
163
164	static int vmap_pmd_range(pud_t pud, unsigned* long addr, unsigned long end,
165	phys_addr_t phys_addr, pgprot_t prot,
166	unsigned int max_page_shift, pgtbl_mod_mask *mask)
167	{
168	pmd_t *pmd;
169	unsigned long next;
170
171	pmd = pmd_alloc_track(mm: &init_mm, pud, address: addr, mod_mask: mask);
172	if (!pmd)
173	return -ENOMEM;
174	do {
175	next = pmd_addr_end(addr, end);
176
177	if (vmap_try_huge_pmd(pmd, addr, end: next, phys_addr, prot,
178	max_page_shift)) {
179	*mask \|= PGTBL_PMD_MODIFIED;
180	continue;
181	}
182
183	if (vmap_pte_range(pmd, addr, end: next, phys_addr, prot, max_page_shift, mask))
184	return -ENOMEM;
185	} while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
186	return `0`;
187	}
188
189	static int vmap_try_huge_pud(pud_t pud, unsigned* long addr, unsigned long end,
190	phys_addr_t phys_addr, pgprot_t prot,
191	unsigned int max_page_shift)
192	{
193	if (max_page_shift < PUD_SHIFT)
194	return `0`;
195
196	if (!arch_vmap_pud_supported(prot))
197	return `0`;
198
199	if ((end - addr) != PUD_SIZE)
200	return `0`;
201
202	if (!IS_ALIGNED(addr, PUD_SIZE))
203	return `0`;
204
205	if (!IS_ALIGNED(phys_addr, PUD_SIZE))
206	return `0`;
207
208	if (pud_present(pud: *pud) && !pud_free_pmd_page(pud, addr))
209	return `0`;
210
211	return pud_set_huge(pud, addr: phys_addr, prot);
212	}
213
214	static int vmap_pud_range(p4d_t p4d, unsigned* long addr, unsigned long end,
215	phys_addr_t phys_addr, pgprot_t prot,
216	unsigned int max_page_shift, pgtbl_mod_mask *mask)
217	{
218	pud_t *pud;
219	unsigned long next;
220
221	pud = pud_alloc_track(mm: &init_mm, p4d, address: addr, mod_mask: mask);
222	if (!pud)
223	return -ENOMEM;
224	do {
225	next = pud_addr_end(addr, end);
226
227	if (vmap_try_huge_pud(pud, addr, end: next, phys_addr, prot,
228	max_page_shift)) {
229	*mask \|= PGTBL_PUD_MODIFIED;
230	continue;
231	}
232
233	if (vmap_pmd_range(pud, addr, end: next, phys_addr, prot,
234	max_page_shift, mask))
235	return -ENOMEM;
236	} while (pud++, phys_addr += (next - addr), addr = next, addr != end);
237	return `0`;
238	}
239
240	static int vmap_try_huge_p4d(p4d_t p4d, unsigned* long addr, unsigned long end,
241	phys_addr_t phys_addr, pgprot_t prot,
242	unsigned int max_page_shift)
243	{
244	if (max_page_shift < P4D_SHIFT)
245	return `0`;
246
247	if (!arch_vmap_p4d_supported(prot))
248	return `0`;
249
250	if ((end - addr) != P4D_SIZE)
251	return `0`;
252
253	if (!IS_ALIGNED(addr, P4D_SIZE))
254	return `0`;
255
256	if (!IS_ALIGNED(phys_addr, P4D_SIZE))
257	return `0`;
258
259	if (p4d_present(p4d: *p4d) && !p4d_free_pud_page(p4d, addr))
260	return `0`;
261
262	return p4d_set_huge(p4d, addr: phys_addr, prot);
263	}
264
265	static int vmap_p4d_range(pgd_t pgd, unsigned* long addr, unsigned long end,
266	phys_addr_t phys_addr, pgprot_t prot,
267	unsigned int max_page_shift, pgtbl_mod_mask *mask)
268	{
269	p4d_t *p4d;
270	unsigned long next;
271
272	p4d = p4d_alloc_track(mm: &init_mm, pgd, address: addr, mod_mask: mask);
273	if (!p4d)
274	return -ENOMEM;
275	do {
276	next = p4d_addr_end(addr, end);
277
278	if (vmap_try_huge_p4d(p4d, addr, end: next, phys_addr, prot,
279	max_page_shift)) {
280	*mask \|= PGTBL_P4D_MODIFIED;
281	continue;
282	}
283
284	if (vmap_pud_range(p4d, addr, end: next, phys_addr, prot,
285	max_page_shift, mask))
286	return -ENOMEM;
287	} while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
288	return `0`;
289	}
290
291	static int vmap_range_noflush(unsigned long addr, unsigned long end,
292	phys_addr_t phys_addr, pgprot_t prot,
293	unsigned int max_page_shift)
294	{
295	pgd_t *pgd;
296	unsigned long start;
297	unsigned long next;
298	int err;
299	pgtbl_mod_mask mask = `0`;
300
301	might_sleep();
302	BUG_ON(addr >= end);
303
304	start = addr;
305	pgd = pgd_offset_k(addr);
306	do {
307	next = pgd_addr_end(addr, end);
308	err = vmap_p4d_range(pgd, addr, end: next, phys_addr, prot,
309	max_page_shift, mask: &mask);
310	if (err)
311	break;
312	} while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
313
314	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
315	arch_sync_kernel_mappings(start, end);
316
317	return err;
318	}
319
320	int vmap_page_range(unsigned long addr, unsigned long end,
321	phys_addr_t phys_addr, pgprot_t prot)
322	{
323	int err;
324
325	err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot),
326	max_page_shift: ioremap_max_page_shift);
327	flush_cache_vmap(start: addr, end);
328	if (!err)
329	err = kmsan_ioremap_page_range(start: addr, end, phys_addr, prot,
330	page_shift: ioremap_max_page_shift);
331	return err;
332	}
333
334	int ioremap_page_range(unsigned long addr, unsigned long end,
335	phys_addr_t phys_addr, pgprot_t prot)
336	{
337	struct vm_struct *area;
338
339	area = find_vm_area(addr: (void *)addr);
340	if (!area \|\| !(area->flags & VM_IOREMAP)) {
341	WARN_ONCE(`1`, "vm_area at addr %lx is not marked as VM_IOREMAP\n", addr);
342	return -EINVAL;
343	}
344	if (addr != (unsigned long)area->addr \|\|
345	(void *)end != area->addr + get_vm_area_size(area)) {
346	WARN_ONCE(`1`, "ioremap request [%lx,%lx) doesn't match vm_area [%lx, %lx)\n",
347	addr, end, (long)area->addr,
348	(long)area->addr + get_vm_area_size(area));
349	return -ERANGE;
350	}
351	return vmap_page_range(addr, end, phys_addr, prot);
352	}
353
354	static void vunmap_pte_range(pmd_t pmd, unsigned* long addr, unsigned long end,
355	pgtbl_mod_mask *mask)
356	{
357	pte_t *pte;
358	pte_t ptent;
359	unsigned long size = PAGE_SIZE;
360
361	pte = pte_offset_kernel(pmd, address: addr);
362	arch_enter_lazy_mmu_mode();
363
364	do {
365	#ifdef CONFIG_HUGETLB_PAGE
366	size = arch_vmap_pte_range_unmap_size(addr, ptep: pte);
367	if (size != PAGE_SIZE) {
368	if (WARN_ON(!IS_ALIGNED(addr, size))) {
369	addr = ALIGN_DOWN(addr, size);
370	pte = PTR_ALIGN_DOWN(pte, sizeof(pte) (size >> PAGE_SHIFT));
371	}
372	ptent = huge_ptep_get_and_clear(mm: &init_mm, addr, ptep: pte, sz: size);
373	if (WARN_ON(end - addr < size))
374	size = end - addr;
375	} else
376	#endif
377	ptent = ptep_get_and_clear(mm: &init_mm, addr, ptep: pte);
378	WARN_ON(!pte_none(ptent) && !pte_present(ptent));
379	} while (pte += (size >> PAGE_SHIFT), addr += size, addr != end);
380
381	arch_leave_lazy_mmu_mode();
382	*mask \|= PGTBL_PTE_MODIFIED;
383	}
384
385	static void vunmap_pmd_range(pud_t pud, unsigned* long addr, unsigned long end,
386	pgtbl_mod_mask *mask)
387	{
388	pmd_t *pmd;
389	unsigned long next;
390	int cleared;
391
392	pmd = pmd_offset(pud, address: addr);
393	do {
394	next = pmd_addr_end(addr, end);
395
396	cleared = pmd_clear_huge(pmd);
397	if (cleared \|\| pmd_bad(pmd: *pmd))
398	*mask \|= PGTBL_PMD_MODIFIED;
399
400	if (cleared) {
401	WARN_ON(next - addr < PMD_SIZE);
402	continue;
403	}
404	if (pmd_none_or_clear_bad(pmd))
405	continue;
406	vunmap_pte_range(pmd, addr, end: next, mask);
407
408	cond_resched();
409	} while (pmd++, addr = next, addr != end);
410	}
411
412	static void vunmap_pud_range(p4d_t p4d, unsigned* long addr, unsigned long end,
413	pgtbl_mod_mask *mask)
414	{
415	pud_t *pud;
416	unsigned long next;
417	int cleared;
418
419	pud = pud_offset(p4d, address: addr);
420	do {
421	next = pud_addr_end(addr, end);
422
423	cleared = pud_clear_huge(pud);
424	if (cleared \|\| pud_bad(pud: *pud))
425	*mask \|= PGTBL_PUD_MODIFIED;
426
427	if (cleared) {
428	WARN_ON(next - addr < PUD_SIZE);
429	continue;
430	}
431	if (pud_none_or_clear_bad(pud))
432	continue;
433	vunmap_pmd_range(pud, addr, end: next, mask);
434	} while (pud++, addr = next, addr != end);
435	}
436
437	static void vunmap_p4d_range(pgd_t pgd, unsigned* long addr, unsigned long end,
438	pgtbl_mod_mask *mask)
439	{
440	p4d_t *p4d;
441	unsigned long next;
442
443	p4d = p4d_offset(pgd, address: addr);
444	do {
445	next = p4d_addr_end(addr, end);
446
447	p4d_clear_huge(p4d);
448	if (p4d_bad(p4d: *p4d))
449	*mask \|= PGTBL_P4D_MODIFIED;
450
451	if (p4d_none_or_clear_bad(p4d))
452	continue;
453	vunmap_pud_range(p4d, addr, end: next, mask);
454	} while (p4d++, addr = next, addr != end);
455	}
456
457	/*
458	* vunmap_range_noflush is similar to vunmap_range, but does not
459	* flush caches or TLBs.
460	*
461	* The caller is responsible for calling flush_cache_vmap() before calling
462	* this function, and flush_tlb_kernel_range after it has returned
463	* successfully (and before the addresses are expected to cause a page fault
464	* or be re-mapped for something else, if TLB flushes are being delayed or
465	* coalesced).
466	*
467	* This is an internal function only. Do not use outside mm/.
468	*/
469	void __vunmap_range_noflush(unsigned long start, unsigned long end)
470	{
471	unsigned long next;
472	pgd_t *pgd;
473	unsigned long addr = start;
474	pgtbl_mod_mask mask = `0`;
475
476	BUG_ON(addr >= end);
477	pgd = pgd_offset_k(addr);
478	do {
479	next = pgd_addr_end(addr, end);
480	if (pgd_bad(pgd: *pgd))
481	mask \|= PGTBL_PGD_MODIFIED;
482	if (pgd_none_or_clear_bad(pgd))
483	continue;
484	vunmap_p4d_range(pgd, addr, end: next, mask: &mask);
485	} while (pgd++, addr = next, addr != end);
486
487	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
488	arch_sync_kernel_mappings(start, end);
489	}
490
/*
 * Unmap [start, end) without flushing caches or TLBs: KMSAN is told
 * about the range first, then the page tables are torn down by
 * __vunmap_range_noflush().
 */
void vunmap_range_noflush(unsigned long start, unsigned long end)
{
	kmsan_vunmap_range_noflush(start, end);
	__vunmap_range_noflush(start, end);
}
496
/**
 * vunmap_range - unmap kernel virtual addresses
 * @addr: start of the VM area to unmap
 * @end: end of the VM area to unmap (non-inclusive)
 *
 * Clears any present PTEs in the virtual address range, flushes TLBs and
 * caches. Any subsequent access to the address before it has been re-mapped
 * is a kernel bug.
 */
void vunmap_range(unsigned long addr, unsigned long end)
{
	/* Cache flush before the unmap, TLB flush after it. */
	flush_cache_vunmap(addr, end);
	vunmap_range_noflush(addr, end);
	flush_tlb_kernel_range(addr, end);
}
512
513	static int vmap_pages_pte_range(pmd_t pmd, unsigned* long addr,
514	unsigned long end, pgprot_t prot, struct page *pages, int* *nr,
515	pgtbl_mod_mask *mask)
516	{
517	pte_t *pte;
518
519	/*
520	* nr is a running index into the array which helps higher level
521	* callers keep track of where we're up to.
522	*/
523
524	pte = pte_alloc_kernel_track(pmd, addr, mask);
525	if (!pte)
526	return -ENOMEM;
527
528	arch_enter_lazy_mmu_mode();
529
530	do {
531	struct page page = pages[nr];
532
533	if (WARN_ON(!pte_none(ptep_get(pte))))
534	return -EBUSY;
535	if (WARN_ON(!page))
536	return -ENOMEM;
537	if (WARN_ON(!pfn_valid(page_to_pfn(page))))
538	return -EINVAL;
539
540	set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
541	(*nr)++;
542	} while (pte++, addr += PAGE_SIZE, addr != end);
543
544	arch_leave_lazy_mmu_mode();
545	*mask \|= PGTBL_PTE_MODIFIED;
546	return `0`;
547	}
548
549	static int vmap_pages_pmd_range(pud_t pud, unsigned* long addr,
550	unsigned long end, pgprot_t prot, struct page *pages, int* *nr,
551	pgtbl_mod_mask *mask)
552	{
553	pmd_t *pmd;
554	unsigned long next;
555
556	pmd = pmd_alloc_track(mm: &init_mm, pud, address: addr, mod_mask: mask);
557	if (!pmd)
558	return -ENOMEM;
559	do {
560	next = pmd_addr_end(addr, end);
561	if (vmap_pages_pte_range(pmd, addr, end: next, prot, pages, nr, mask))
562	return -ENOMEM;
563	} while (pmd++, addr = next, addr != end);
564	return `0`;
565	}
566
567	static int vmap_pages_pud_range(p4d_t p4d, unsigned* long addr,
568	unsigned long end, pgprot_t prot, struct page *pages, int* *nr,
569	pgtbl_mod_mask *mask)
570	{
571	pud_t *pud;
572	unsigned long next;
573
574	pud = pud_alloc_track(mm: &init_mm, p4d, address: addr, mod_mask: mask);
575	if (!pud)
576	return -ENOMEM;
577	do {
578	next = pud_addr_end(addr, end);
579	if (vmap_pages_pmd_range(pud, addr, end: next, prot, pages, nr, mask))
580	return -ENOMEM;
581	} while (pud++, addr = next, addr != end);
582	return `0`;
583	}
584
585	static int vmap_pages_p4d_range(pgd_t pgd, unsigned* long addr,
586	unsigned long end, pgprot_t prot, struct page *pages, int* *nr,
587	pgtbl_mod_mask *mask)
588	{
589	p4d_t *p4d;
590	unsigned long next;
591
592	p4d = p4d_alloc_track(mm: &init_mm, pgd, address: addr, mod_mask: mask);
593	if (!p4d)
594	return -ENOMEM;
595	do {
596	next = p4d_addr_end(addr, end);
597	if (vmap_pages_pud_range(p4d, addr, end: next, prot, pages, nr, mask))
598	return -ENOMEM;
599	} while (p4d++, addr = next, addr != end);
600	return `0`;
601	}
602
603	static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
604	pgprot_t prot, struct page **pages)
605	{
606	unsigned long start = addr;
607	pgd_t *pgd;
608	unsigned long next;
609	int err = `0`;
610	int nr = `0`;
611	pgtbl_mod_mask mask = `0`;
612
613	BUG_ON(addr >= end);
614	pgd = pgd_offset_k(addr);
615	do {
616	next = pgd_addr_end(addr, end);
617	if (pgd_bad(pgd: *pgd))
618	mask \|= PGTBL_PGD_MODIFIED;
619	err = vmap_pages_p4d_range(pgd, addr, end: next, prot, pages, nr: &nr, mask: &mask);
620	if (err)
621	break;
622	} while (pgd++, addr = next, addr != end);
623
624	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
625	arch_sync_kernel_mappings(start, end);
626
627	return err;
628	}
629
630	/*
631	* vmap_pages_range_noflush is similar to vmap_pages_range, but does not
632	* flush caches.
633	*
634	* The caller is responsible for calling flush_cache_vmap() after this
635	* function returns successfully and before the addresses are accessed.
636	*
637	* This is an internal function only. Do not use outside mm/.
638	*/
639	int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
640	pgprot_t prot, struct page *pages, unsigned* int page_shift)
641	{
642	unsigned int i, nr = (end - addr) >> PAGE_SHIFT;
643
644	WARN_ON(page_shift < PAGE_SHIFT);
645
646	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) \|\|
647	page_shift == PAGE_SHIFT)
648	return vmap_small_pages_range_noflush(addr, end, prot, pages);
649
650	for (i = `0`; i < nr; i += `1U` << (page_shift - PAGE_SHIFT)) {
651	int err;
652
653	err = vmap_range_noflush(addr, end: addr + (`1UL` << page_shift),
654	page_to_phys(pages[i]), prot,
655	max_page_shift: page_shift);
656	if (err)
657	return err;
658
659	addr += `1UL` << page_shift;
660	}
661
662	return `0`;
663	}
664
665	int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
666	pgprot_t prot, struct page *pages, unsigned* int page_shift)
667	{
668	int ret = kmsan_vmap_pages_range_noflush(start: addr, end, prot, pages,
669	page_shift);
670
671	if (ret)
672	return ret;
673	return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
674	}
675
676	/**
677	* vmap_pages_range - map pages to a kernel virtual address
678	* @addr: start of the VM area to map
679	* @end: end of the VM area to map (non-inclusive)
680	* @prot: page protection flags to use
681	* @pages: pages to map (always PAGE_SIZE pages)
682	* @page_shift: maximum shift that the pages may be mapped with, @pages must
683	* be aligned and contiguous up to at least this shift.
684	*
685	* RETURNS:
686	* 0 on success, -errno on failure.
687	*/
688	int vmap_pages_range(unsigned long addr, unsigned long end,
689	pgprot_t prot, struct page *pages, unsigned* int page_shift)
690	{
691	int err;
692
693	err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
694	flush_cache_vmap(start: addr, end);
695	return err;
696	}
697
698	static int check_sparse_vm_area(struct vm_struct area, unsigned* long start,
699	unsigned long end)
700	{
701	might_sleep();
702	if (WARN_ON_ONCE(area->flags & VM_FLUSH_RESET_PERMS))
703	return -EINVAL;
704	if (WARN_ON_ONCE(area->flags & VM_NO_GUARD))
705	return -EINVAL;
706	if (WARN_ON_ONCE(!(area->flags & VM_SPARSE)))
707	return -EINVAL;
708	if ((end - start) >> PAGE_SHIFT > totalram_pages())
709	return -E2BIG;
710	if (start < (unsigned long)area->addr \|\|
711	(void *)end > area->addr + get_vm_area_size(area))
712	return -ERANGE;
713	return `0`;
714	}
715
716	/**
717	* vm_area_map_pages - map pages inside given sparse vm_area
718	* @area: vm_area
719	* @start: start address inside vm_area
720	* @end: end address inside vm_area
721	* @pages: pages to map (always PAGE_SIZE pages)
722	*/
723	int vm_area_map_pages(struct vm_struct area, unsigned* long start,
724	unsigned long end, struct page **pages)
725	{
726	int err;
727
728	err = check_sparse_vm_area(area, start, end);
729	if (err)
730	return err;
731
732	return vmap_pages_range(addr: start, end, PAGE_KERNEL, pages, PAGE_SHIFT);
733	}
734
/**
 * vm_area_unmap_pages - unmap pages inside given sparse vm_area
 * @area: vm_area
 * @start: start address inside vm_area
 * @end: end address inside vm_area
 *
 * Nothing is unmapped, and no error is reported to the caller, when
 * check_sparse_vm_area() rejects the request.
 */
void vm_area_unmap_pages(struct vm_struct *area, unsigned long start,
			 unsigned long end)
{
	if (check_sparse_vm_area(area, start, end))
		return;

	vunmap_range(start, end);
}
749
/*
 * is_vmalloc_or_module_addr - test for a vmalloc or module-space address
 * @x: pointer to test
 *
 * Return: 1 if @x (with its KASAN tag stripped) lies in
 * [MODULES_VADDR, MODULES_END) when that range is configured, otherwise
 * the result of is_vmalloc_addr().
 */
int is_vmalloc_or_module_addr(const void *x)
{
	/*
	 * ARM, x86-64 and sparc64 put modules in a special place,
	 * and fall back on vmalloc() if that fails. Others
	 * just put it in the vmalloc space.
	 */
#if defined(CONFIG_EXECMEM) && defined(MODULES_VADDR)
	unsigned long addr = (unsigned long)kasan_reset_tag(x);
	if (addr >= MODULES_VADDR && addr < MODULES_END)
		return 1;
#endif
	return is_vmalloc_addr(x);
}
EXPORT_SYMBOL_GPL(is_vmalloc_or_module_addr);
765
766	/*
767	* Walk a vmap address to the struct page it maps. Huge vmap mappings will
768	* return the tail page that corresponds to the base page address, which
769	* matches small vmap mappings.
770	*/
771	struct page vmalloc_to_page(const* void *vmalloc_addr)
772	{
773	unsigned long addr = (unsigned long) vmalloc_addr;
774	struct page *page = NULL;
775	pgd_t *pgd = pgd_offset_k(addr);
776	p4d_t *p4d;
777	pud_t *pud;
778	pmd_t *pmd;
779	pte_t *ptep, pte;
780
781	/*
782	* XXX we might need to change this if we add VIRTUAL_BUG_ON for
783	* architectures that do not vmalloc module space
784	*/
785	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
786
787	if (pgd_none(pgd: *pgd))
788	return NULL;
789	if (WARN_ON_ONCE(pgd_leaf(*pgd)))
790	return NULL; / XXX: no allowance for huge pgd /
791	if (WARN_ON_ONCE(pgd_bad(*pgd)))
792	return NULL;
793
794	p4d = p4d_offset(pgd, address: addr);
795	if (p4d_none(p4d: *p4d))
796	return NULL;
797	if (p4d_leaf(*p4d))
798	return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
799	if (WARN_ON_ONCE(p4d_bad(*p4d)))
800	return NULL;
801
802	pud = pud_offset(p4d, address: addr);
803	if (pud_none(pud: *pud))
804	return NULL;
805	if (pud_leaf(pud: *pud))
806	return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
807	if (WARN_ON_ONCE(pud_bad(*pud)))
808	return NULL;
809
810	pmd = pmd_offset(pud, address: addr);
811	if (pmd_none(pmd: *pmd))
812	return NULL;
813	if (pmd_leaf(pte: *pmd))
814	return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
815	if (WARN_ON_ONCE(pmd_bad(*pmd)))
816	return NULL;
817
818	ptep = pte_offset_kernel(pmd, address: addr);
819	pte = ptep_get(ptep);
820	if (pte_present(a: pte))
821	page = pte_page(pte);
822
823	return page;
824	}
825	EXPORT_SYMBOL(vmalloc_to_page);
826
/*
 * Map a vmalloc()-space virtual address to the physical page frame number.
 *
 * The result of vmalloc_to_page() is passed to page_to_pfn() without a
 * NULL check.
 */
unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
{
	return page_to_pfn(vmalloc_to_page(vmalloc_addr));
}
EXPORT_SYMBOL(vmalloc_to_pfn);
835
836
/*** Global kva allocator ***/
838
839	#define DEBUG_AUGMENT_PROPAGATE_CHECK 0
840	#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0
841
842
843	static DEFINE_SPINLOCK(free_vmap_area_lock);
844	static bool vmap_initialized __read_mostly;
845
846	/*
847	* This kmem_cache is used for vmap_area objects. Instead of
848	* allocating from slab we reuse an object from this cache to
849	* make things faster. Especially in "no edge" splitting of
850	* free block.
851	*/
852	static struct kmem_cache *vmap_area_cachep;
853
854	/*
855	* This linked list is used in pair with free_vmap_area_root.
856	* It gives O(1) access to prev/next to perform fast coalescing.
857	*/
858	static LIST_HEAD(free_vmap_area_list);
859
860	/*
861	* This augment red-black tree represents the free vmap space.
862	* All vmap_area objects in this tree are sorted by va->va_start
863	* address. It is used for allocation and merging when a vmap
864	* object is released.
865	*
866	* Each vmap_area node contains a maximum available free block
867	* of its sub-tree, right or left. Therefore it is possible to
868	* find a lowest match of free area.
869	*/
870	static struct rb_root free_vmap_area_root = RB_ROOT;
871
872	/*
873	* Preload a CPU with one object for "no edge" split case. The
874	* aim is to get rid of allocations from the atomic context, thus
875	* to use more permissive allocation masks.
876	*/
877	static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
878
879	/*
880	* This structure defines a single, solid model where a list and
881	* rb-tree are part of one entity protected by the lock. Nodes are
882	* sorted in ascending order, thus for O(1) access to left/right
883	* neighbors a list is used as well as for sequential traversal.
884	*/
885	struct rb_list {
886	struct rb_root root;
887	struct list_head head;
888	spinlock_t lock;
889	};
890
891	/*
892	* A fast size storage contains VAs up to 1M size. A pool consists
893	* of linked between each other ready to go VAs of certain sizes.
894	* An index in the pool-array corresponds to number of pages + 1.
895	*/
896	#define MAX_VA_SIZE_PAGES 256
897
898	struct vmap_pool {
899	struct list_head head;
900	unsigned long len;
901	};
902
903	/*
904	* An effective vmap-node logic. Users make use of nodes instead
905	* of a global heap. It allows to balance an access and mitigate
906	* contention.
907	*/
908	static struct vmap_node {
909	/ Simple size segregated storage. /
910	struct vmap_pool pool[MAX_VA_SIZE_PAGES];
911	spinlock_t pool_lock;
912	bool skip_populate;
913
914	/ Bookkeeping data of this node. /
915	struct rb_list busy;
916	struct rb_list lazy;
917
918	/*
919	* Ready-to-free areas.
920	*/
921	struct list_head purge_list;
922	struct work_struct purge_work;
923	unsigned long nr_purged;
924	} single;
925
926	/*
927	* Initial setup consists of one single node, i.e. a balancing
928	* is fully disabled. Later on, after vmap is initialized these
929	* parameters are updated based on a system capacity.
930	*/
931	static struct vmap_node *vmap_nodes = &single;
932	static __read_mostly unsigned int nr_vmap_nodes = `1`;
933	static __read_mostly unsigned int vmap_zone_size = `1`;
934
935	/ A simple iterator over all vmap-nodes. /
936	#define for_each_vmap_node(vn) \
937	for ((vn) = &vmap_nodes[0]; \
938	(vn) < &vmap_nodes[nr_vmap_nodes]; (vn)++)
939
940	static inline unsigned int
941	addr_to_node_id(unsigned long addr)
942	{
943	return (addr / vmap_zone_size) % nr_vmap_nodes;
944	}
945
946	static inline struct vmap_node *
947	addr_to_node(unsigned long addr)
948	{
949	return &vmap_nodes[addr_to_node_id(addr)];
950	}
951
952	static inline struct vmap_node *
953	id_to_node(unsigned int id)
954	{
955	return &vmap_nodes[id % nr_vmap_nodes];
956	}
957
958	static inline unsigned int
959	node_to_id(struct vmap_node *node)
960	{
961	/ Pointer arithmetic. /
962	unsigned int id = node - vmap_nodes;
963
964	if (likely(id < nr_vmap_nodes))
965	return id;
966
967	WARN_ONCE(`1`, "An address 0x%p is out-of-bounds.\n", node);
968	return `0`;
969	}
970
971	/*
972	* We use the value 0 to represent "no node", that is why
973	* an encoded value will be the node-id incremented by 1.
974	* It is always greater then 0. A valid node_id which can
975	* be encoded is [0:nr_vmap_nodes - 1]. If a passed node_id
976	* is not valid 0 is returned.
977	*/
978	static unsigned int
979	encode_vn_id(unsigned int node_id)
980	{
981	/ Can store U8_MAX [0:254] nodes. /
982	if (node_id < nr_vmap_nodes)
983	return (node_id + `1`) << BITS_PER_BYTE;
984
985	/ Warn and no node encoded. /
986	WARN_ONCE(`1`, "Encode wrong node id (%u)\n", node_id);
987	return `0`;
988	}
989
990	/*
991	* Returns an encoded node-id, the valid range is within
992	* [0:nr_vmap_nodes-1] values. Otherwise nr_vmap_nodes is
993	* returned if extracted data is wrong.
994	*/
995	static unsigned int
996	decode_vn_id(unsigned int val)
997	{
998	unsigned int node_id = (val >> BITS_PER_BYTE) - `1`;
999
1000	/ Can store U8_MAX [0:254] nodes. /
1001	if (node_id < nr_vmap_nodes)
1002	return node_id;
1003
1004	/ If it was _not_ zero, warn. /
1005	WARN_ONCE(node_id != UINT_MAX,
1006	"Decode wrong node id (%d)\n", node_id);
1007
1008	return nr_vmap_nodes;
1009	}
1010
1011	static bool
1012	is_vn_id_valid(unsigned int node_id)
1013	{
1014	if (node_id < nr_vmap_nodes)
1015	return true;
1016
1017	return false;
1018	}
1019
1020	static __always_inline unsigned long
1021	va_size(struct vmap_area *va)
1022	{
1023	return (va->va_end - va->va_start);
1024	}
1025
1026	static __always_inline unsigned long
1027	get_subtree_max_size(struct rb_node *node)
1028	{
1029	struct vmap_area *va;
1030
1031	va = rb_entry_safe(node, struct vmap_area, rb_node);
1032	return va ? va->subtree_max_size : `0`;
1033	}
1034
1035	RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
1036	struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)
1037
1038	static void reclaim_and_purge_vmap_areas(void);
1039	static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
1040	static void drain_vmap_area_work(struct work_struct *work);
1041	static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work);
1042
1043	static __cacheline_aligned_in_smp atomic_long_t nr_vmalloc_pages;
1044	static __cacheline_aligned_in_smp atomic_long_t vmap_lazy_nr;
1045
1046	unsigned long vmalloc_nr_pages(void)
1047	{
1048	return atomic_long_read(v: &nr_vmalloc_pages);
1049	}
1050
1051	static struct vmap_area __find_vmap_area(unsigned* long addr, struct rb_root *root)
1052	{
1053	struct rb_node *n = root->rb_node;
1054
1055	addr = (unsigned long)kasan_reset_tag(addr: (void *)addr);
1056
1057	while (n) {
1058	struct vmap_area *va;
1059
1060	va = rb_entry(n, struct vmap_area, rb_node);
1061	if (addr < va->va_start)
1062	n = n->rb_left;
1063	else if (addr >= va->va_end)
1064	n = n->rb_right;
1065	else
1066	return va;
1067	}
1068
1069	return NULL;
1070	}
1071
1072	/ Look up the first VA which satisfies addr < va_end, NULL if none. /
1073	static struct vmap_area *
1074	__find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root)
1075	{
1076	struct vmap_area *va = NULL;
1077	struct rb_node *n = root->rb_node;
1078
1079	addr = (unsigned long)kasan_reset_tag(addr: (void *)addr);
1080
1081	while (n) {
1082	struct vmap_area *tmp;
1083
1084	tmp = rb_entry(n, struct vmap_area, rb_node);
1085	if (tmp->va_end > addr) {
1086	va = tmp;
1087	if (tmp->va_start <= addr)
1088	break;
1089
1090	n = n->rb_left;
1091	} else
1092	n = n->rb_right;
1093	}
1094
1095	return va;
1096	}
1097
1098	/*
1099	* Returns a node where a first VA, that satisfies addr < va_end, resides.
1100	* If success, a node is locked. A user is responsible to unlock it when a
1101	* VA is no longer needed to be accessed.
1102	*
1103	* Returns NULL if nothing found.
1104	*/
1105	static struct vmap_node *
1106	find_vmap_area_exceed_addr_lock(unsigned long addr, struct vmap_area **va)
1107	{
1108	unsigned long va_start_lowest;
1109	struct vmap_node *vn;
1110
1111	repeat:
1112	va_start_lowest = `0`;
1113
1114	for_each_vmap_node(vn) {
1115	spin_lock(lock: &vn->busy.lock);
1116	*va = __find_vmap_area_exceed_addr(addr, root: &vn->busy.root);
1117
1118	if (*va)
1119	if (!va_start_lowest \|\| (*va)->va_start < va_start_lowest)
1120	va_start_lowest = (*va)->va_start;
1121	spin_unlock(lock: &vn->busy.lock);
1122	}
1123
1124	/*
1125	* Check if found VA exists, it might have gone away. In this case we
1126	* repeat the search because a VA has been removed concurrently and we
1127	* need to proceed to the next one, which is a rare case.
1128	*/
1129	if (va_start_lowest) {
1130	vn = addr_to_node(addr: va_start_lowest);
1131
1132	spin_lock(lock: &vn->busy.lock);
1133	*va = __find_vmap_area(addr: va_start_lowest, root: &vn->busy.root);
1134
1135	if (*va)
1136	return vn;
1137
1138	spin_unlock(lock: &vn->busy.lock);
1139	goto repeat;
1140	}
1141
1142	return NULL;
1143	}
1144
1145	/*
1146	* This function returns back addresses of parent node
1147	* and its left or right link for further processing.
1148	*
1149	* Otherwise NULL is returned. In that case all further
1150	* steps regarding inserting of conflicting overlap range
1151	* have to be declined and actually considered as a bug.
1152	*/
1153	static __always_inline struct rb_node **
1154	find_va_links(struct vmap_area *va,
1155	struct rb_root root, struct* rb_node *from,
1156	struct rb_node **parent)
1157	{
1158	struct vmap_area *tmp_va;
1159	struct rb_node **link;
1160
1161	if (root) {
1162	link = &root->rb_node;
1163	if (unlikely(!*link)) {
1164	*parent = NULL;
1165	return link;
1166	}
1167	} else {
1168	link = &from;
1169	}
1170
1171	/*
1172	* Go to the bottom of the tree. When we hit the last point
1173	* we end up with parent rb_node and correct direction, i name
1174	* it link, where the new va->rb_node will be attached to.
1175	*/
1176	do {
1177	tmp_va = rb_entry(link, struct* vmap_area, rb_node);
1178
1179	/*
1180	* During the traversal we also do some sanity check.
1181	* Trigger the BUG() if there are sides(left/right)
1182	* or full overlaps.
1183	*/
1184	if (va->va_end <= tmp_va->va_start)
1185	link = &(*link)->rb_left;
1186	else if (va->va_start >= tmp_va->va_end)
1187	link = &(*link)->rb_right;
1188	else {
1189	WARN(`1`, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
1190	va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end);
1191
1192	return NULL;
1193	}
1194	} while (*link);
1195
1196	*parent = &tmp_va->rb_node;
1197	return link;
1198	}
1199
1200	static __always_inline struct list_head *
1201	get_va_next_sibling(struct rb_node parent, struct* rb_node **link)
1202	{
1203	struct list_head *list;
1204
1205	if (unlikely(!parent))
1206	/*
1207	* The red-black tree where we try to find VA neighbors
1208	* before merging or inserting is empty, i.e. it means
1209	* there is no free vmap space. Normally it does not
1210	* happen but we handle this case anyway.
1211	*/
1212	return NULL;
1213
1214	list = &rb_entry(parent, struct vmap_area, rb_node)->list;
1215	return (&parent->rb_right == link ? list->next : list);
1216	}
1217
1218	static __always_inline void
1219	__link_va(struct vmap_area va, struct* rb_root *root,
1220	struct rb_node parent, struct* rb_node **link,
1221	struct list_head *head, bool augment)
1222	{
1223	/*
1224	* VA is still not in the list, but we can
1225	* identify its future previous list_head node.
1226	*/
1227	if (likely(parent)) {
1228	head = &rb_entry(parent, struct vmap_area, rb_node)->list;
1229	if (&parent->rb_right != link)
1230	head = head->prev;
1231	}
1232
1233	/ Insert to the rb-tree /
1234	rb_link_node(node: &va->rb_node, parent, rb_link: link);
1235	if (augment) {
1236	/*
1237	* Some explanation here. Just perform simple insertion
1238	* to the tree. We do not set va->subtree_max_size to
1239	* its current size before calling rb_insert_augmented().
1240	* It is because we populate the tree from the bottom
1241	* to parent levels when the node _is_ in the tree.
1242	*
1243	* Therefore we set subtree_max_size to zero after insertion,
1244	* to let __augment_tree_propagate_from() puts everything to
1245	* the correct order later on.
1246	*/
1247	rb_insert_augmented(node: &va->rb_node,
1248	root, augment: &free_vmap_area_rb_augment_cb);
1249	va->subtree_max_size = `0`;
1250	} else {
1251	rb_insert_color(&va->rb_node, root);
1252	}
1253
1254	/ Address-sort this list /
1255	list_add(new: &va->list, head);
1256	}
1257
1258	static __always_inline void
1259	link_va(struct vmap_area va, struct* rb_root *root,
1260	struct rb_node parent, struct* rb_node **link,
1261	struct list_head *head)
1262	{
1263	__link_va(va, root, parent, link, head, augment: false);
1264	}
1265
1266	static __always_inline void
1267	link_va_augment(struct vmap_area va, struct* rb_root *root,
1268	struct rb_node parent, struct* rb_node **link,
1269	struct list_head *head)
1270	{
1271	__link_va(va, root, parent, link, head, augment: true);
1272	}
1273
1274	static __always_inline void
1275	__unlink_va(struct vmap_area va, struct* rb_root *root, bool augment)
1276	{
1277	if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
1278	return;
1279
1280	if (augment)
1281	rb_erase_augmented(node: &va->rb_node,
1282	root, augment: &free_vmap_area_rb_augment_cb);
1283	else
1284	rb_erase(&va->rb_node, root);
1285
1286	list_del_init(entry: &va->list);
1287	RB_CLEAR_NODE(&va->rb_node);
1288	}
1289
1290	static __always_inline void
1291	unlink_va(struct vmap_area va, struct* rb_root *root)
1292	{
1293	__unlink_va(va, root, augment: false);
1294	}
1295
1296	static __always_inline void
1297	unlink_va_augment(struct vmap_area va, struct* rb_root *root)
1298	{
1299	__unlink_va(va, root, augment: true);
1300	}
1301
#if DEBUG_AUGMENT_PROPAGATE_CHECK
/*
 * Recompute the value subtree_max_size is expected to hold for @va: the
 * largest of its own size and the values reported for its two children.
 */
static __always_inline unsigned long
compute_subtree_max_size(struct vmap_area *va)
{
	struct rb_node *node = &va->rb_node;

	return max3(va_size(va),
		get_subtree_max_size(node->rb_left),
		get_subtree_max_size(node->rb_right));
}

/*
 * Debug helper: walk the whole free list and complain loudly about every
 * VA whose cached subtree_max_size differs from the recomputed value.
 */
static void
augment_tree_propagate_check(void)
{
	struct vmap_area *va;

	list_for_each_entry(va, &free_vmap_area_list, list) {
		unsigned long expected = compute_subtree_max_size(va);

		if (expected == va->subtree_max_size)
			continue;

		pr_emerg("tree is corrupted: %lu, %lu\n",
			va_size(va), va->subtree_max_size);
	}
}
#endif
1328
1329	/*
1330	* This function populates subtree_max_size from bottom to upper
1331	* levels starting from VA point. The propagation must be done
1332	* when VA size is modified by changing its va_start/va_end. Or
1333	* in case of newly inserting of VA to the tree.
1334	*
1335	* It means that __augment_tree_propagate_from() must be called:
1336	* - After VA has been inserted to the tree(free path);
1337	* - After VA has been shrunk(allocation path);
1338	* - After VA has been increased(merging path).
1339	*
1340	* Please note that, it does not mean that upper parent nodes
1341	* and their subtree_max_size are recalculated all the time up
1342	* to the root node.
1343	*
1344	* 4--8
1345	* /\
1346	* / \
1347	* / \
1348	* 2--2 8--8
1349	*
1350	* For example if we modify the node 4, shrinking it to 2, then
1351	* no any modification is required. If we shrink the node 2 to 1
1352	* its subtree_max_size is updated only, and set to 1. If we shrink
1353	* the node 8 to 6, then its subtree_max_size is set to 6 and parent
1354	* node becomes 4--6.
1355	*/
1356	static __always_inline void
1357	augment_tree_propagate_from(struct vmap_area *va)
1358	{
1359	/*
1360	* Populate the tree from bottom towards the root until
1361	* the calculated maximum available size of checked node
1362	* is equal to its current one.
1363	*/
1364	free_vmap_area_rb_augment_cb_propagate(rb: &va->rb_node, NULL);
1365
1366	#if DEBUG_AUGMENT_PROPAGATE_CHECK
1367	augment_tree_propagate_check();
1368	#endif
1369	}
1370
1371	static void
1372	insert_vmap_area(struct vmap_area *va,
1373	struct rb_root root, struct* list_head *head)
1374	{
1375	struct rb_node **link;
1376	struct rb_node *parent;
1377
1378	link = find_va_links(va, root, NULL, parent: &parent);
1379	if (link)
1380	link_va(va, root, parent, link, head);
1381	}
1382
1383	static void
1384	insert_vmap_area_augment(struct vmap_area *va,
1385	struct rb_node from, struct* rb_root *root,
1386	struct list_head *head)
1387	{
1388	struct rb_node **link;
1389	struct rb_node *parent;
1390
1391	if (from)
1392	link = find_va_links(va, NULL, from, parent: &parent);
1393	else
1394	link = find_va_links(va, root, NULL, parent: &parent);
1395
1396	if (link) {
1397	link_va_augment(va, root, parent, link, head);
1398	augment_tree_propagate_from(va);
1399	}
1400	}
1401
1402	/*
1403	* Merge de-allocated chunk of VA memory with previous
1404	* and next free blocks. If coalesce is not done a new
1405	* free area is inserted. If VA has been merged, it is
1406	* freed.
1407	*
1408	* Please note, it can return NULL in case of overlap
1409	* ranges, followed by WARN() report. Despite it is a
1410	* buggy behaviour, a system can be alive and keep
1411	* ongoing.
1412	*/
1413	static __always_inline struct vmap_area *
1414	__merge_or_add_vmap_area(struct vmap_area *va,
1415	struct rb_root root, struct* list_head *head, bool augment)
1416	{
1417	struct vmap_area *sibling;
1418	struct list_head *next;
1419	struct rb_node **link;
1420	struct rb_node *parent;
1421	bool merged = false;
1422
1423	/*
1424	* Find a place in the tree where VA potentially will be
1425	* inserted, unless it is merged with its sibling/siblings.
1426	*/
1427	link = find_va_links(va, root, NULL, parent: &parent);
1428	if (!link)
1429	return NULL;
1430
1431	/*
1432	* Get next node of VA to check if merging can be done.
1433	*/
1434	next = get_va_next_sibling(parent, link);
1435	if (unlikely(next == NULL))
1436	goto insert;
1437
1438	/*
1439	* start end
1440	* \| \|
1441	* \|<------VA------>\|<-----Next----->\|
1442	* \| \|
1443	* start end
1444	*/
1445	if (next != head) {
1446	sibling = list_entry(next, struct vmap_area, list);
1447	if (sibling->va_start == va->va_end) {
1448	sibling->va_start = va->va_start;
1449
1450	/ Free vmap_area object. /
1451	kmem_cache_free(s: vmap_area_cachep, objp: va);
1452
1453	/ Point to the new merged area. /
1454	va = sibling;
1455	merged = true;
1456	}
1457	}
1458
1459	/*
1460	* start end
1461	* \| \|
1462	* \|<-----Prev----->\|<------VA------>\|
1463	* \| \|
1464	* start end
1465	*/
1466	if (next->prev != head) {
1467	sibling = list_entry(next->prev, struct vmap_area, list);
1468	if (sibling->va_end == va->va_start) {
1469	/*
1470	* If both neighbors are coalesced, it is important
1471	* to unlink the "next" node first, followed by merging
1472	* with "previous" one. Otherwise the tree might not be
1473	* fully populated if a sibling's augmented value is
1474	* "normalized" because of rotation operations.
1475	*/
1476	if (merged)
1477	__unlink_va(va, root, augment);
1478
1479	sibling->va_end = va->va_end;
1480
1481	/ Free vmap_area object. /
1482	kmem_cache_free(s: vmap_area_cachep, objp: va);
1483
1484	/ Point to the new merged area. /
1485	va = sibling;
1486	merged = true;
1487	}
1488	}
1489
1490	insert:
1491	if (!merged)
1492	__link_va(va, root, parent, link, head, augment);
1493
1494	return va;
1495	}
1496
1497	static __always_inline struct vmap_area *
1498	merge_or_add_vmap_area(struct vmap_area *va,
1499	struct rb_root root, struct* list_head *head)
1500	{
1501	return __merge_or_add_vmap_area(va, root, head, augment: false);
1502	}
1503
1504	static __always_inline struct vmap_area *
1505	merge_or_add_vmap_area_augment(struct vmap_area *va,
1506	struct rb_root root, struct* list_head *head)
1507	{
1508	va = __merge_or_add_vmap_area(va, root, head, augment: true);
1509	if (va)
1510	augment_tree_propagate_from(va);
1511
1512	return va;
1513	}
1514
1515	static __always_inline bool
1516	is_within_this_va(struct vmap_area va, unsigned* long size,
1517	unsigned long align, unsigned long vstart)
1518	{
1519	unsigned long nva_start_addr;
1520
1521	if (va->va_start > vstart)
1522	nva_start_addr = ALIGN(va->va_start, align);
1523	else
1524	nva_start_addr = ALIGN(vstart, align);
1525
1526	/ Can be overflowed due to big size or alignment. /
1527	if (nva_start_addr + size < nva_start_addr \|\|
1528	nva_start_addr < vstart)
1529	return false;
1530
1531	return (nva_start_addr + size <= va->va_end);
1532	}
1533
1534	/*
1535	* Find the first free block(lowest start address) in the tree,
1536	* that will accomplish the request corresponding to passing
1537	* parameters. Please note, with an alignment bigger than PAGE_SIZE,
1538	* a search length is adjusted to account for worst case alignment
1539	* overhead.
1540	*/
1541	static __always_inline struct vmap_area *
1542	find_vmap_lowest_match(struct rb_root root, unsigned* long size,
1543	unsigned long align, unsigned long vstart, bool adjust_search_size)
1544	{
1545	struct vmap_area *va;
1546	struct rb_node *node;
1547	unsigned long length;
1548
1549	/ Start from the root. /
1550	node = root->rb_node;
1551
1552	/ Adjust the search size for alignment overhead. /
1553	length = adjust_search_size ? size + align - `1` : size;
1554
1555	while (node) {
1556	va = rb_entry(node, struct vmap_area, rb_node);
1557
1558	if (get_subtree_max_size(node: node->rb_left) >= length &&
1559	vstart < va->va_start) {
1560	node = node->rb_left;
1561	} else {
1562	if (is_within_this_va(va, size, align, vstart))
1563	return va;
1564
1565	/*
1566	* Does not make sense to go deeper towards the right
1567	* sub-tree if it does not have a free block that is
1568	* equal or bigger to the requested search length.
1569	*/
1570	if (get_subtree_max_size(node: node->rb_right) >= length) {
1571	node = node->rb_right;
1572	continue;
1573	}
1574
1575	/*
1576	* OK. We roll back and find the first right sub-tree,
1577	* that will satisfy the search criteria. It can happen
1578	* due to "vstart" restriction or an alignment overhead
1579	* that is bigger then PAGE_SIZE.
1580	*/
1581	while ((node = rb_parent(node))) {
1582	va = rb_entry(node, struct vmap_area, rb_node);
1583	if (is_within_this_va(va, size, align, vstart))
1584	return va;
1585
1586	if (get_subtree_max_size(node: node->rb_right) >= length &&
1587	vstart <= va->va_start) {
1588	/*
1589	* Shift the vstart forward. Please note, we update it with
1590	* parent's start address adding "1" because we do not want
1591	* to enter same sub-tree after it has already been checked
1592	* and no suitable free block found there.
1593	*/
1594	vstart = va->va_start + `1`;
1595	node = node->rb_right;
1596	break;
1597	}
1598	}
1599	}
1600	}
1601
1602	return NULL;
1603	}
1604
#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
#include <linux/random.h>

/*
 * Reference implementation for the debug check: scan the list @head in
 * order and return the first area that can hold the request, or NULL.
 */
static struct vmap_area *
find_vmap_lowest_linear_match(struct list_head *head, unsigned long size,
	unsigned long align, unsigned long vstart)
{
	struct vmap_area *va;

	list_for_each_entry(va, head, list) {
		if (!is_within_this_va(va, size, align, vstart))
			continue;

		return va;
	}

	return NULL;
}

/*
 * Debug helper: pick a random vstart above VMALLOC_START and verify that
 * the tree search and the linear list search agree on the result. A
 * mismatch is only reported, not corrected.
 */
static void
find_vmap_lowest_match_check(struct rb_root *root, struct list_head *head,
			     unsigned long size, unsigned long align)
{
	struct vmap_area *va_1, *va_2;
	unsigned long vstart;
	unsigned int rnd;

	get_random_bytes(&rnd, sizeof(rnd));
	vstart = VMALLOC_START + rnd;

	va_1 = find_vmap_lowest_match(root, size, align, vstart, false);
	va_2 = find_vmap_lowest_linear_match(head, size, align, vstart);

	if (va_1 != va_2)
		pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
			va_1, va_2, vstart);
}
#endif
1643
/*
 * How a requested range sits inside a free area, as computed by
 * classify_va_fit_type().
 */
enum fit_type {
	NOTHING_FIT = 0,
	FL_FIT_TYPE = 1,	/* full fit */
	LE_FIT_TYPE = 2,	/* left edge fit */
	RE_FIT_TYPE = 3,	/* right edge fit */
	NE_FIT_TYPE = 4		/* no edge fit */
};
1651
1652	static __always_inline enum fit_type
1653	classify_va_fit_type(struct vmap_area *va,
1654	unsigned long nva_start_addr, unsigned long size)
1655	{
1656	enum fit_type type;
1657
1658	/ Check if it is within VA. /
1659	if (nva_start_addr < va->va_start \|\|
1660	nva_start_addr + size > va->va_end)
1661	return NOTHING_FIT;
1662
1663	/ Now classify. /
1664	if (va->va_start == nva_start_addr) {
1665	if (va->va_end == nva_start_addr + size)
1666	type = FL_FIT_TYPE;
1667	else
1668	type = LE_FIT_TYPE;
1669	} else if (va->va_end == nva_start_addr + size) {
1670	type = RE_FIT_TYPE;
1671	} else {
1672	type = NE_FIT_TYPE;
1673	}
1674
1675	return type;
1676	}
1677
1678	static __always_inline int
1679	va_clip(struct rb_root root, struct* list_head *head,
1680	struct vmap_area va, unsigned* long nva_start_addr,
1681	unsigned long size)
1682	{
1683	struct vmap_area *lva = NULL;
1684	enum fit_type type = classify_va_fit_type(va, nva_start_addr, size);
1685
1686	if (type == FL_FIT_TYPE) {
1687	/*
1688	* No need to split VA, it fully fits.
1689	*
1690	* \| \|
1691	* V NVA V
1692	* \|---------------\|
1693	*/
1694	unlink_va_augment(va, root);
1695	kmem_cache_free(s: vmap_area_cachep, objp: va);
1696	} else if (type == LE_FIT_TYPE) {
1697	/*
1698	* Split left edge of fit VA.
1699	*
1700	* \| \|
1701	* V NVA V R
1702	* \|-------\|-------\|
1703	*/
1704	va->va_start += size;
1705	} else if (type == RE_FIT_TYPE) {
1706	/*
1707	* Split right edge of fit VA.
1708	*
1709	* \| \|
1710	* L V NVA V
1711	* \|-------\|-------\|
1712	*/
1713	va->va_end = nva_start_addr;
1714	} else if (type == NE_FIT_TYPE) {
1715	/*
1716	* Split no edge of fit VA.
1717	*
1718	* \| \|
1719	* L V NVA V R
1720	* \|---\|-------\|---\|
1721	*/
1722	lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
1723	if (unlikely(!lva)) {
1724	/*
1725	* For percpu allocator we do not do any pre-allocation
1726	* and leave it as it is. The reason is it most likely
1727	* never ends up with NE_FIT_TYPE splitting. In case of
1728	* percpu allocations offsets and sizes are aligned to
1729	* fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE
1730	* are its main fitting cases.
1731	*
1732	* There are a few exceptions though, as an example it is
1733	* a first allocation (early boot up) when we have "one"
1734	* big free space that has to be split.
1735	*
1736	* Also we can hit this path in case of regular "vmap"
1737	* allocations, if "this" current CPU was not preloaded.
1738	* See the comment in alloc_vmap_area() why. If so, then
1739	* GFP_NOWAIT is used instead to get an extra object for
1740	* split purpose. That is rare and most time does not
1741	* occur.
1742	*
1743	* What happens if an allocation gets failed. Basically,
1744	* an "overflow" path is triggered to purge lazily freed
1745	* areas to free some memory, then, the "retry" path is
1746	* triggered to repeat one more time. See more details
1747	* in alloc_vmap_area() function.
1748	*/
1749	lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
1750	if (!lva)
1751	return -ENOMEM;
1752	}
1753
1754	/*
1755	* Build the remainder.
1756	*/
1757	lva->va_start = va->va_start;
1758	lva->va_end = nva_start_addr;
1759
1760	/*
1761	* Shrink this VA to remaining size.
1762	*/
1763	va->va_start = nva_start_addr + size;
1764	} else {
1765	return -EINVAL;
1766	}
1767
1768	if (type != FL_FIT_TYPE) {
1769	augment_tree_propagate_from(va);
1770
1771	if (lva) / type == NE_FIT_TYPE /
1772	insert_vmap_area_augment(va: lva, from: &va->rb_node, root, head);
1773	}
1774
1775	return `0`;
1776	}
1777
1778	static unsigned long
1779	va_alloc(struct vmap_area *va,
1780	struct rb_root root, struct* list_head *head,
1781	unsigned long size, unsigned long align,
1782	unsigned long vstart, unsigned long vend)
1783	{
1784	unsigned long nva_start_addr;
1785	int ret;
1786
1787	if (va->va_start > vstart)
1788	nva_start_addr = ALIGN(va->va_start, align);
1789	else
1790	nva_start_addr = ALIGN(vstart, align);
1791
1792	/ Check the "vend" restriction. /
1793	if (nva_start_addr + size > vend)
1794	return -ERANGE;
1795
1796	/ Update the free vmap_area. /
1797	ret = va_clip(root, head, va, nva_start_addr, size);
1798	if (WARN_ON_ONCE(ret))
1799	return ret;
1800
1801	return nva_start_addr;
1802	}
1803
1804	/*
1805	* Returns a start address of the newly allocated area, if success.
1806	* Otherwise an error value is returned that indicates failure.
1807	*/
1808	static __always_inline unsigned long
1809	__alloc_vmap_area(struct rb_root root, struct* list_head *head,
1810	unsigned long size, unsigned long align,
1811	unsigned long vstart, unsigned long vend)
1812	{
1813	bool adjust_search_size = true;
1814	unsigned long nva_start_addr;
1815	struct vmap_area *va;
1816
1817	/*
1818	* Do not adjust when:
1819	* a) align <= PAGE_SIZE, because it does not make any sense.
1820	* All blocks(their start addresses) are at least PAGE_SIZE
1821	* aligned anyway;
1822	* b) a short range where a requested size corresponds to exactly
1823	* specified [vstart:vend] interval and an alignment > PAGE_SIZE.
1824	* With adjusted search length an allocation would not succeed.
1825	*/
1826	if (align <= PAGE_SIZE \|\| (align > PAGE_SIZE && (vend - vstart) == size))
1827	adjust_search_size = false;
1828
1829	va = find_vmap_lowest_match(root, size, align, vstart, adjust_search_size);
1830	if (unlikely(!va))
1831	return -ENOENT;
1832
1833	nva_start_addr = va_alloc(va, root, head, size, align, vstart, vend);
1834
1835	#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
1836	if (!IS_ERR_VALUE(nva_start_addr))
1837	find_vmap_lowest_match_check(root, head, size, align);
1838	#endif
1839
1840	return nva_start_addr;
1841	}
1842
1843	/*
1844	* Free a region of KVA allocated by alloc_vmap_area
1845	*/
1846	static void free_vmap_area(struct vmap_area *va)
1847	{
1848	struct vmap_node *vn = addr_to_node(addr: va->va_start);
1849
1850	/*
1851	* Remove from the busy tree/list.
1852	*/
1853	spin_lock(lock: &vn->busy.lock);
1854	unlink_va(va, root: &vn->busy.root);
1855	spin_unlock(lock: &vn->busy.lock);
1856
1857	/*
1858	* Insert/Merge it back to the free tree/list.
1859	*/
1860	spin_lock(lock: &free_vmap_area_lock);
1861	merge_or_add_vmap_area_augment(va, root: &free_vmap_area_root, head: &free_vmap_area_list);
1862	spin_unlock(lock: &free_vmap_area_lock);
1863	}
1864
1865	static inline void
1866	preload_this_cpu_lock(spinlock_t lock, gfp_t gfp_mask, int* node)
1867	{
1868	struct vmap_area va = NULL, tmp;
1869
1870	/*
1871	* Preload this CPU with one extra vmap_area object. It is used
1872	* when fit type of free area is NE_FIT_TYPE. It guarantees that
1873	* a CPU that does an allocation is preloaded.
1874	*
1875	* We do it in non-atomic context, thus it allows us to use more
1876	* permissive allocation masks to be more stable under low memory
1877	* condition and high memory pressure.
1878	*/
1879	if (!this_cpu_read(ne_fit_preload_node))
1880	va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
1881
1882	spin_lock(lock);
1883
1884	tmp = NULL;
1885	if (va && !__this_cpu_try_cmpxchg(ne_fit_preload_node, &tmp, va))
1886	kmem_cache_free(s: vmap_area_cachep, objp: va);
1887	}
1888
1889	static struct vmap_pool *
1890	size_to_va_pool(struct vmap_node vn, unsigned* long size)
1891	{
1892	unsigned int idx = (size - `1`) / PAGE_SIZE;
1893
1894	if (idx < MAX_VA_SIZE_PAGES)
1895	return &vn->pool[idx];
1896
1897	return NULL;
1898	}
1899
1900	static bool
1901	node_pool_add_va(struct vmap_node n, struct* vmap_area *va)
1902	{
1903	struct vmap_pool *vp;
1904
1905	vp = size_to_va_pool(vn: n, size: va_size(va));
1906	if (!vp)
1907	return false;
1908
1909	spin_lock(lock: &n->pool_lock);
1910	list_add(new: &va->list, head: &vp->head);
1911	WRITE_ONCE(vp->len, vp->len + `1`);
1912	spin_unlock(lock: &n->pool_lock);
1913
1914	return true;
1915	}
1916
1917	static struct vmap_area *
1918	node_pool_del_va(struct vmap_node vn, unsigned* long size,
1919	unsigned long align, unsigned long vstart,
1920	unsigned long vend)
1921	{
1922	struct vmap_area *va = NULL;
1923	struct vmap_pool *vp;
1924	int err = `0`;
1925
1926	vp = size_to_va_pool(vn, size);
1927	if (!vp \|\| list_empty(head: &vp->head))
1928	return NULL;
1929
1930	spin_lock(lock: &vn->pool_lock);
1931	if (!list_empty(head: &vp->head)) {
1932	va = list_first_entry(&vp->head, struct vmap_area, list);
1933
1934	if (IS_ALIGNED(va->va_start, align)) {
1935	/*
1936	* Do some sanity check and emit a warning
1937	* if one of below checks detects an error.
1938	*/
1939	err \|= (va_size(va) != size);
1940	err \|= (va->va_start < vstart);
1941	err \|= (va->va_end > vend);
1942
1943	if (!WARN_ON_ONCE(err)) {
1944	list_del_init(entry: &va->list);
1945	WRITE_ONCE(vp->len, vp->len - `1`);
1946	} else {
1947	va = NULL;
1948	}
1949	} else {
1950	list_move_tail(list: &va->list, head: &vp->head);
1951	va = NULL;
1952	}
1953	}
1954	spin_unlock(lock: &vn->pool_lock);
1955
1956	return va;
1957	}
1958
1959	static struct vmap_area *
1960	node_alloc(unsigned long size, unsigned long align,
1961	unsigned long vstart, unsigned long vend,
1962	unsigned long addr, unsigned* int *vn_id)
1963	{
1964	struct vmap_area *va;
1965
1966	*vn_id = `0`;
1967	*addr = -EINVAL;
1968
1969	/*
1970	* Fallback to a global heap if not vmalloc or there
1971	* is only one node.
1972	*/
1973	if (vstart != VMALLOC_START \|\| vend != VMALLOC_END \|\|
1974	nr_vmap_nodes == `1`)
1975	return NULL;
1976
1977	*vn_id = raw_smp_processor_id() % nr_vmap_nodes;
1978	va = node_pool_del_va(vn: id_to_node(id: *vn_id), size, align, vstart, vend);
1979	vn_id = encode_vn_id(node_id: vn_id);
1980
1981	if (va)
1982	*addr = va->va_start;
1983
1984	return va;
1985	}
1986
1987	static inline void setup_vmalloc_vm(struct vm_struct *vm,
1988	struct vmap_area va, unsigned* long flags, const void *caller)
1989	{
1990	vm->flags = flags;
1991	vm->addr = (void *)va->va_start;
1992	vm->size = vm->requested_size = va_size(va);
1993	vm->caller = caller;
1994	va->vm = vm;
1995	}
1996
1997	/*
1998	* Allocate a region of KVA of the specified size and alignment, within the
1999	* vstart and vend. If vm is passed in, the two will also be bound.
2000	*/
2001	static struct vmap_area alloc_vmap_area(unsigned* long size,
2002	unsigned long align,
2003	unsigned long vstart, unsigned long vend,
2004	int node, gfp_t gfp_mask,
2005	unsigned long va_flags, struct vm_struct *vm)
2006	{
2007	struct vmap_node *vn;
2008	struct vmap_area *va;
2009	unsigned long freed;
2010	unsigned long addr;
2011	unsigned int vn_id;
2012	int purged = `0`;
2013	int ret;
2014
2015	if (unlikely(!size \|\| offset_in_page(size) \|\| !is_power_of_2(align)))
2016	return ERR_PTR(error: -EINVAL);
2017
2018	if (unlikely(!vmap_initialized))
2019	return ERR_PTR(error: -EBUSY);
2020
2021	might_sleep();
2022
2023	/*
2024	* If a VA is obtained from a global heap(if it fails here)
2025	* it is anyway marked with this "vn_id" so it is returned
2026	* to this pool's node later. Such way gives a possibility
2027	* to populate pools based on users demand.
2028	*
2029	* On success a ready to go VA is returned.
2030	*/
2031	va = node_alloc(size, align, vstart, vend, addr: &addr, vn_id: &vn_id);
2032	if (!va) {
2033	gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
2034
2035	va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
2036	if (unlikely(!va))
2037	return ERR_PTR(error: -ENOMEM);
2038
2039	/*
2040	* Only scan the relevant parts containing pointers to other objects
2041	* to avoid false negatives.
2042	*/
2043	kmemleak_scan_area(ptr: &va->rb_node, SIZE_MAX, gfp: gfp_mask);
2044	}
2045
2046	retry:
2047	if (IS_ERR_VALUE(addr)) {
2048	preload_this_cpu_lock(lock: &free_vmap_area_lock, gfp_mask, node);
2049	addr = __alloc_vmap_area(root: &free_vmap_area_root, head: &free_vmap_area_list,
2050	size, align, vstart, vend);
2051	spin_unlock(lock: &free_vmap_area_lock);
2052	}
2053
2054	trace_alloc_vmap_area(addr, size, align, vstart, vend, IS_ERR_VALUE(addr));
2055
2056	/*
2057	* If an allocation fails, the error value is
2058	* returned. Therefore trigger the overflow path.
2059	*/
2060	if (IS_ERR_VALUE(addr))
2061	goto overflow;
2062
2063	va->va_start = addr;
2064	va->va_end = addr + size;
2065	va->vm = NULL;
2066	va->flags = (va_flags \| vn_id);
2067
2068	if (vm) {
2069	vm->addr = (void *)va->va_start;
2070	vm->size = va_size(va);
2071	va->vm = vm;
2072	}
2073
2074	vn = addr_to_node(addr: va->va_start);
2075
2076	spin_lock(lock: &vn->busy.lock);
2077	insert_vmap_area(va, root: &vn->busy.root, head: &vn->busy.head);
2078	spin_unlock(lock: &vn->busy.lock);
2079
2080	BUG_ON(!IS_ALIGNED(va->va_start, align));
2081	BUG_ON(va->va_start < vstart);
2082	BUG_ON(va->va_end > vend);
2083
2084	ret = kasan_populate_vmalloc(addr, size);
2085	if (ret) {
2086	free_vmap_area(va);
2087	return ERR_PTR(error: ret);
2088	}
2089
2090	return va;
2091
2092	overflow:
2093	if (!purged) {
2094	reclaim_and_purge_vmap_areas();
2095	purged = `1`;
2096	goto retry;
2097	}
2098
2099	freed = `0`;
2100	blocking_notifier_call_chain(nh: &vmap_notify_list, val: `0`, v: &freed);
2101
2102	if (freed > `0`) {
2103	purged = `0`;
2104	goto retry;
2105	}
2106
2107	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
2108	pr_warn("vmalloc_node_range for size %lu failed: Address range restricted to %#lx - %#lx\n",
2109	size, vstart, vend);
2110
2111	kmem_cache_free(s: vmap_area_cachep, objp: va);
2112	return ERR_PTR(error: -EBUSY);
2113	}
2114
2115	int register_vmap_purge_notifier(struct notifier_block *nb)
2116	{
2117	return blocking_notifier_chain_register(nh: &vmap_notify_list, nb);
2118	}
2119	EXPORT_SYMBOL_GPL(register_vmap_purge_notifier);
2120
2121	int unregister_vmap_purge_notifier(struct notifier_block *nb)
2122	{
2123	return blocking_notifier_chain_unregister(nh: &vmap_notify_list, nb);
2124	}
2125	EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
2126
2127	/*
2128	* lazy_max_pages is the maximum amount of virtual address space we gather up
2129	* before attempting to purge with a TLB flush.
2130	*
2131	* There is a tradeoff here: a larger number will cover more kernel page tables
2132	* and take slightly longer to purge, but it will linearly reduce the number of
2133	* global TLB flushes that must be performed. It would seem natural to scale
2134	* this number up linearly with the number of CPUs (because vmapping activity
2135	* could also scale linearly with the number of CPUs), however it is likely
2136	* that in practice, workloads might be constrained in other ways that mean
2137	* vmap activity will not scale linearly with CPUs. Also, I want to be
2138	* conservative and not introduce a big latency on huge systems, so go with
2139	* a less aggressive log scale. It will still be an improvement over the old
2140	* code, and it will be simple to change the scale factor if we find that it
2141	* becomes a problem on bigger systems.
2142	*/
2143	static unsigned long lazy_max_pages(void)
2144	{
2145	unsigned int log;
2146
2147	log = fls(x: num_online_cpus());
2148
2149	return log * (`32UL` * `1024` * `1024` / PAGE_SIZE);
2150	}
2151
/*
 * Serialize vmap purging.  There is no actual critical section protected
 * by this lock, but we want to avoid concurrent calls for performance
 * reasons and to make the pcpu_get_vm_areas more deterministic.
 */
static DEFINE_MUTEX(vmap_purge_lock);

/* for per-CPU blocks */
static void purge_fragmented_blocks_allcpus(void);
2161
2162	static void
2163	reclaim_list_global(struct list_head *head)
2164	{
2165	struct vmap_area va, n;
2166
2167	if (list_empty(head))
2168	return;
2169
2170	spin_lock(lock: &free_vmap_area_lock);
2171	list_for_each_entry_safe(va, n, head, list)
2172	merge_or_add_vmap_area_augment(va,
2173	root: &free_vmap_area_root, head: &free_vmap_area_list);
2174	spin_unlock(lock: &free_vmap_area_lock);
2175	}
2176
2177	static void
2178	decay_va_pool_node(struct vmap_node *vn, bool full_decay)
2179	{
2180	LIST_HEAD(decay_list);
2181	struct rb_root decay_root = RB_ROOT;
2182	struct vmap_area va, nva;
2183	unsigned long n_decay, pool_len;
2184	int i;
2185
2186	for (i = `0`; i < MAX_VA_SIZE_PAGES; i++) {
2187	LIST_HEAD(tmp_list);
2188
2189	if (list_empty(head: &vn->pool[i].head))
2190	continue;
2191
2192	/ Detach the pool, so no-one can access it. /
2193	spin_lock(lock: &vn->pool_lock);
2194	list_replace_init(old: &vn->pool[i].head, new: &tmp_list);
2195	spin_unlock(lock: &vn->pool_lock);
2196
2197	pool_len = n_decay = vn->pool[i].len;
2198	WRITE_ONCE(vn->pool[i].len, `0`);
2199
2200	/ Decay a pool by ~25% out of left objects. /
2201	if (!full_decay)
2202	n_decay >>= `2`;
2203	pool_len -= n_decay;
2204
2205	list_for_each_entry_safe(va, nva, &tmp_list, list) {
2206	if (!n_decay--)
2207	break;
2208
2209	list_del_init(entry: &va->list);
2210	merge_or_add_vmap_area(va, root: &decay_root, head: &decay_list);
2211	}
2212
2213	/*
2214	* Attach the pool back if it has been partly decayed.
2215	* Please note, it is supposed that nobody(other contexts)
2216	* can populate the pool therefore a simple list replace
2217	* operation takes place here.
2218	*/
2219	if (!list_empty(head: &tmp_list)) {
2220	spin_lock(lock: &vn->pool_lock);
2221	list_replace_init(old: &tmp_list, new: &vn->pool[i].head);
2222	WRITE_ONCE(vn->pool[i].len, pool_len);
2223	spin_unlock(lock: &vn->pool_lock);
2224	}
2225	}
2226
2227	reclaim_list_global(head: &decay_list);
2228	}
2229
2230	static void
2231	kasan_release_vmalloc_node(struct vmap_node *vn)
2232	{
2233	struct vmap_area *va;
2234	unsigned long start, end;
2235
2236	start = list_first_entry(&vn->purge_list, struct vmap_area, list)->va_start;
2237	end = list_last_entry(&vn->purge_list, struct vmap_area, list)->va_end;
2238
2239	list_for_each_entry(va, &vn->purge_list, list) {
2240	if (is_vmalloc_or_module_addr((void *) va->va_start))
2241	kasan_release_vmalloc(start: va->va_start, end: va->va_end,
2242	free_region_start: va->va_start, free_region_end: va->va_end,
2243	KASAN_VMALLOC_PAGE_RANGE);
2244	}
2245
2246	kasan_release_vmalloc(start, end, free_region_start: start, free_region_end: end, KASAN_VMALLOC_TLB_FLUSH);
2247	}
2248
2249	static void purge_vmap_node(struct work_struct *work)
2250	{
2251	struct vmap_node *vn = container_of(work,
2252	struct vmap_node, purge_work);
2253	unsigned long nr_purged_pages = `0`;
2254	struct vmap_area va, n_va;
2255	LIST_HEAD(local_list);
2256
2257	if (IS_ENABLED(CONFIG_KASAN_VMALLOC))
2258	kasan_release_vmalloc_node(vn);
2259
2260	vn->nr_purged = `0`;
2261
2262	list_for_each_entry_safe(va, n_va, &vn->purge_list, list) {
2263	unsigned long nr = va_size(va) >> PAGE_SHIFT;
2264	unsigned int vn_id = decode_vn_id(val: va->flags);
2265
2266	list_del_init(entry: &va->list);
2267
2268	nr_purged_pages += nr;
2269	vn->nr_purged++;
2270
2271	if (is_vn_id_valid(node_id: vn_id) && !vn->skip_populate)
2272	if (node_pool_add_va(n: vn, va))
2273	continue;
2274
2275	/ Go back to global. /
2276	list_add(new: &va->list, head: &local_list);
2277	}
2278
2279	atomic_long_sub(i: nr_purged_pages, v: &vmap_lazy_nr);
2280
2281	reclaim_list_global(head: &local_list);
2282	}
2283
2284	/*
2285	* Purges all lazily-freed vmap areas.
2286	*/
2287	static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
2288	bool full_pool_decay)
2289	{
2290	unsigned long nr_purged_areas = `0`;
2291	unsigned int nr_purge_helpers;
2292	static cpumask_t purge_nodes;
2293	unsigned int nr_purge_nodes;
2294	struct vmap_node *vn;
2295	int i;
2296
2297	lockdep_assert_held(&vmap_purge_lock);
2298
2299	/*
2300	* Use cpumask to mark which node has to be processed.
2301	*/
2302	purge_nodes = CPU_MASK_NONE;
2303
2304	for_each_vmap_node(vn) {
2305	INIT_LIST_HEAD(list: &vn->purge_list);
2306	vn->skip_populate = full_pool_decay;
2307	decay_va_pool_node(vn, full_decay: full_pool_decay);
2308
2309	if (RB_EMPTY_ROOT(&vn->lazy.root))
2310	continue;
2311
2312	spin_lock(lock: &vn->lazy.lock);
2313	WRITE_ONCE(vn->lazy.root.rb_node, NULL);
2314	list_replace_init(old: &vn->lazy.head, new: &vn->purge_list);
2315	spin_unlock(lock: &vn->lazy.lock);
2316
2317	start = min(start, list_first_entry(&vn->purge_list,
2318	struct vmap_area, list)->va_start);
2319
2320	end = max(end, list_last_entry(&vn->purge_list,
2321	struct vmap_area, list)->va_end);
2322
2323	cpumask_set_cpu(cpu: node_to_id(node: vn), dstp: &purge_nodes);
2324	}
2325
2326	nr_purge_nodes = cpumask_weight(srcp: &purge_nodes);
2327	if (nr_purge_nodes > `0`) {
2328	flush_tlb_kernel_range(start, end);
2329
2330	/ One extra worker is per a lazy_max_pages() full set minus one. /
2331	nr_purge_helpers = atomic_long_read(v: &vmap_lazy_nr) / lazy_max_pages();
2332	nr_purge_helpers = clamp(nr_purge_helpers, `1U`, nr_purge_nodes) - `1`;
2333
2334	for_each_cpu(i, &purge_nodes) {
2335	vn = &vmap_nodes[i];
2336
2337	if (nr_purge_helpers > `0`) {
2338	INIT_WORK(&vn->purge_work, purge_vmap_node);
2339
2340	if (cpumask_test_cpu(cpu: i, cpu_online_mask))
2341	schedule_work_on(cpu: i, work: &vn->purge_work);
2342	else
2343	schedule_work(work: &vn->purge_work);
2344
2345	nr_purge_helpers--;
2346	} else {
2347	vn->purge_work.func = NULL;
2348	purge_vmap_node(work: &vn->purge_work);
2349	nr_purged_areas += vn->nr_purged;
2350	}
2351	}
2352
2353	for_each_cpu(i, &purge_nodes) {
2354	vn = &vmap_nodes[i];
2355
2356	if (vn->purge_work.func) {
2357	flush_work(work: &vn->purge_work);
2358	nr_purged_areas += vn->nr_purged;
2359	}
2360	}
2361	}
2362
2363	trace_purge_vmap_area_lazy(start, end, npurged: nr_purged_areas);
2364	return nr_purged_areas > `0`;
2365	}
2366
2367	/*
2368	* Reclaim vmap areas by purging fragmented blocks and purge_vmap_area_list.
2369	*/
2370	static void reclaim_and_purge_vmap_areas(void)
2371
2372	{
2373	mutex_lock(&vmap_purge_lock);
2374	purge_fragmented_blocks_allcpus();
2375	__purge_vmap_area_lazy(ULONG_MAX, end: `0`, full_pool_decay: true);
2376	mutex_unlock(lock: &vmap_purge_lock);
2377	}
2378
2379	static void drain_vmap_area_work(struct work_struct *work)
2380	{
2381	mutex_lock(&vmap_purge_lock);
2382	__purge_vmap_area_lazy(ULONG_MAX, end: `0`, full_pool_decay: false);
2383	mutex_unlock(lock: &vmap_purge_lock);
2384	}
2385
2386	/*
2387	* Free a vmap area, caller ensuring that the area has been unmapped,
2388	* unlinked and flush_cache_vunmap had been called for the correct
2389	* range previously.
2390	*/
2391	static void free_vmap_area_noflush(struct vmap_area *va)
2392	{
2393	unsigned long nr_lazy_max = lazy_max_pages();
2394	unsigned long va_start = va->va_start;
2395	unsigned int vn_id = decode_vn_id(val: va->flags);
2396	struct vmap_node *vn;
2397	unsigned long nr_lazy;
2398
2399	if (WARN_ON_ONCE(!list_empty(&va->list)))
2400	return;
2401
2402	nr_lazy = atomic_long_add_return_relaxed(i: va_size(va) >> PAGE_SHIFT,
2403	v: &vmap_lazy_nr);
2404
2405	/*
2406	* If it was request by a certain node we would like to
2407	* return it to that node, i.e. its pool for later reuse.
2408	*/
2409	vn = is_vn_id_valid(node_id: vn_id) ?
2410	id_to_node(id: vn_id):addr_to_node(addr: va->va_start);
2411
2412	spin_lock(lock: &vn->lazy.lock);
2413	insert_vmap_area(va, root: &vn->lazy.root, head: &vn->lazy.head);
2414	spin_unlock(lock: &vn->lazy.lock);
2415
2416	trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max);
2417
2418	/ After this point, we may free va at any time /
2419	if (unlikely(nr_lazy > nr_lazy_max))
2420	schedule_work(work: &drain_vmap_work);
2421	}
2422
2423	/*
2424	* Free and unmap a vmap area
2425	*/
2426	static void free_unmap_vmap_area(struct vmap_area *va)
2427	{
2428	flush_cache_vunmap(start: va->va_start, end: va->va_end);
2429	vunmap_range_noflush(start: va->va_start, end: va->va_end);
2430	if (debug_pagealloc_enabled_static())
2431	flush_tlb_kernel_range(start: va->va_start, end: va->va_end);
2432
2433	free_vmap_area_noflush(va);
2434	}
2435
2436	struct vmap_area find_vmap_area(unsigned* long addr)
2437	{
2438	struct vmap_node *vn;
2439	struct vmap_area *va;
2440	int i, j;
2441
2442	if (unlikely(!vmap_initialized))
2443	return NULL;
2444
2445	/*
2446	* An addr_to_node_id(addr) converts an address to a node index
2447	* where a VA is located. If VA spans several zones and passed
2448	* addr is not the same as va->va_start, what is not common, we
2449	* may need to scan extra nodes. See an example:
2450	*
2451	* <----va---->
2452	* -\|-----\|-----\|-----\|-----\|-
2453	* 1 2 0 1
2454	*
2455	* VA resides in node 1 whereas it spans 1, 2 an 0. If passed
2456	* addr is within 2 or 0 nodes we should do extra work.
2457	*/
2458	i = j = addr_to_node_id(addr);
2459	do {
2460	vn = &vmap_nodes[i];
2461
2462	spin_lock(lock: &vn->busy.lock);
2463	va = __find_vmap_area(addr, root: &vn->busy.root);
2464	spin_unlock(lock: &vn->busy.lock);
2465
2466	if (va)
2467	return va;
2468	} while ((i = (i + nr_vmap_nodes - `1`) % nr_vmap_nodes) != j);
2469
2470	return NULL;
2471	}
2472
2473	static struct vmap_area find_unlink_vmap_area(unsigned* long addr)
2474	{
2475	struct vmap_node *vn;
2476	struct vmap_area *va;
2477	int i, j;
2478
2479	/*
2480	* Check the comment in the find_vmap_area() about the loop.
2481	*/
2482	i = j = addr_to_node_id(addr);
2483	do {
2484	vn = &vmap_nodes[i];
2485
2486	spin_lock(lock: &vn->busy.lock);
2487	va = __find_vmap_area(addr, root: &vn->busy.root);
2488	if (va)
2489	unlink_va(va, root: &vn->busy.root);
2490	spin_unlock(lock: &vn->busy.lock);
2491
2492	if (va)
2493	return va;
2494	} while ((i = (i + nr_vmap_nodes - `1`) % nr_vmap_nodes) != j);
2495
2496	return NULL;
2497	}
2498
/*** Per cpu kva allocator ***/

/*
 * vmap space is limited especially on 32 bit architectures. Ensure there is
 * room for at least 16 percpu vmap blocks per CPU.
 */
/*
 * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
 * to #define VMALLOC_SPACE		(VMALLOC_END-VMALLOC_START). Guess
 * instead (we just need a rough idea)
 */
#if BITS_PER_LONG == 32
#define VMALLOC_SPACE		(128UL*1024*1024)
#else
#define VMALLOC_SPACE		(128UL*1024*1024*1024)
#endif

#define VMALLOC_PAGES		(VMALLOC_SPACE / PAGE_SIZE)
#define VMAP_MAX_ALLOC		BITS_PER_LONG	/* 256K with 4K pages */
#define VMAP_BBMAP_BITS_MAX	1024	/* 4MB with 4K pages */
#define VMAP_BBMAP_BITS_MIN	(VMAP_MAX_ALLOC*2)
#define VMAP_MIN(x, y)		((x) < (y) ? (x) : (y)) /* can't use min() */
#define VMAP_MAX(x, y)		((x) > (y) ? (x) : (y)) /* can't use max() */
/* Pages per vmap block: clamped between the MIN and MAX values above. */
#define VMAP_BBMAP_BITS		\
		VMAP_MIN(VMAP_BBMAP_BITS_MAX,	\
		VMAP_MAX(VMAP_BBMAP_BITS_MIN,	\
			VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))

#define VMAP_BLOCK_SIZE		(VMAP_BBMAP_BITS * PAGE_SIZE)

/*
 * Purge threshold to prevent overeager purging of fragmented blocks for
 * regular operations: Purge if vb->free is less than 1/4 of the capacity.
 */
#define VMAP_PURGE_THRESHOLD	(VMAP_BBMAP_BITS / 4)

#define VMAP_RAM		0x1 /* indicates vm_map_ram area*/
#define VMAP_BLOCK		0x2 /* mark out the vmap_block sub-type*/
#define VMAP_FLAGS_MASK		0x3
2538
2539	struct vmap_block_queue {
2540	spinlock_t lock;
2541	struct list_head free;
2542
2543	/*
2544	* An xarray requires an extra memory dynamically to
2545	* be allocated. If it is an issue, we can use rb-tree
2546	* instead.
2547	*/
2548	struct xarray vmap_blocks;
2549	};
2550
2551	struct vmap_block {
2552	spinlock_t lock;
2553	struct vmap_area *va;
2554	unsigned long free, dirty;
2555	DECLARE_BITMAP(used_map, VMAP_BBMAP_BITS);
2556	unsigned long dirty_min, dirty_max; /< dirty range /
2557	struct list_head free_list;
2558	struct rcu_head rcu_head;
2559	struct list_head purge;
2560	unsigned int cpu;
2561	};
2562
2563	/ Queue of free and dirty vmap blocks, for allocation and flushing purposes /
2564	static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
2565
2566	/*
2567	* In order to fast access to any "vmap_block" associated with a
2568	* specific address, we use a hash.
2569	*
2570	* A per-cpu vmap_block_queue is used in both ways, to serialize
2571	* an access to free block chains among CPUs(alloc path) and it
2572	* also acts as a vmap_block hash(alloc/free paths). It means we
2573	* overload it, since we already have the per-cpu array which is
2574	* used as a hash table. When used as a hash a 'cpu' passed to
2575	* per_cpu() is not actually a CPU but rather a hash index.
2576	*
2577	* A hash function is addr_to_vb_xa() which hashes any address
2578	* to a specific index(in a hash) it belongs to. This then uses a
2579	* per_cpu() macro to access an array with generated index.
2580	*
2581	* An example:
2582	*
2583	* CPU_1 CPU_2 CPU_0
2584	* \| \| \|
2585	* V V V
2586	* 0 10 20 30 40 50 60
2587	* \|------\|------\|------\|------\|------\|------\|...<vmap address space>
2588	* CPU0 CPU1 CPU2 CPU0 CPU1 CPU2
2589	*
2590	* - CPU_1 invokes vm_unmap_ram(6), 6 belongs to CPU0 zone, thus
2591	* it access: CPU0/INDEX0 -> vmap_blocks -> xa_lock;
2592	*
2593	* - CPU_2 invokes vm_unmap_ram(11), 11 belongs to CPU1 zone, thus
2594	* it access: CPU1/INDEX1 -> vmap_blocks -> xa_lock;
2595	*
2596	* - CPU_0 invokes vm_unmap_ram(20), 20 belongs to CPU2 zone, thus
2597	* it access: CPU2/INDEX2 -> vmap_blocks -> xa_lock.
2598	*
2599	* This technique almost always avoids lock contention on insert/remove,
2600	* however xarray spinlocks protect against any contention that remains.
2601	*/
2602	static struct xarray *
2603	addr_to_vb_xa(unsigned long addr)
2604	{
2605	int index = (addr / VMAP_BLOCK_SIZE) % nr_cpu_ids;
2606
2607	/*
2608	* Please note, nr_cpu_ids points on a highest set
2609	* possible bit, i.e. we never invoke cpumask_next()
2610	* if an index points on it which is nr_cpu_ids - 1.
2611	*/
2612	if (!cpu_possible(cpu: index))
2613	index = cpumask_next(n: index, cpu_possible_mask);
2614
2615	return &per_cpu(vmap_block_queue, index).vmap_blocks;
2616	}
2617
2618	/*
2619	* We should probably have a fallback mechanism to allocate virtual memory
2620	* out of partially filled vmap blocks. However vmap block sizing should be
2621	* fairly reasonable according to the vmalloc size, so it shouldn't be a
2622	* big problem.
2623	*/
2624
2625	static unsigned long addr_to_vb_idx(unsigned long addr)
2626	{
2627	addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-`1`);
2628	addr /= VMAP_BLOCK_SIZE;
2629	return addr;
2630	}
2631
2632	static void vmap_block_vaddr(unsigned* long va_start, unsigned long pages_off)
2633	{
2634	unsigned long addr;
2635
2636	addr = va_start + (pages_off << PAGE_SHIFT);
2637	BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
2638	return (void *)addr;
2639	}
2640
2641	/**
2642	* new_vmap_block - allocates new vmap_block and occupies 2^order pages in this
2643	* block. Of course pages number can't exceed VMAP_BBMAP_BITS
2644	* @order: how many 2^order pages should be occupied in newly allocated block
2645	* @gfp_mask: flags for the page level allocator
2646	*
2647	* Return: virtual address in a newly allocated block or ERR_PTR(-errno)
2648	*/
2649	static void new_vmap_block(unsigned* int order, gfp_t gfp_mask)
2650	{
2651	struct vmap_block_queue *vbq;
2652	struct vmap_block *vb;
2653	struct vmap_area *va;
2654	struct xarray *xa;
2655	unsigned long vb_idx;
2656	int node, err;
2657	void *vaddr;
2658
2659	node = numa_node_id();
2660
2661	vb = kmalloc_node(sizeof(struct vmap_block),
2662	gfp_mask & GFP_RECLAIM_MASK, node);
2663	if (unlikely(!vb))
2664	return ERR_PTR(error: -ENOMEM);
2665
2666	va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
2667	VMALLOC_START, VMALLOC_END,
2668	node, gfp_mask,
2669	VMAP_RAM\|VMAP_BLOCK, NULL);
2670	if (IS_ERR(ptr: va)) {
2671	kfree(objp: vb);
2672	return ERR_CAST(ptr: va);
2673	}
2674
2675	vaddr = vmap_block_vaddr(va_start: va->va_start, pages_off: `0`);
2676	spin_lock_init(&vb->lock);
2677	vb->va = va;
2678	/ At least something should be left free /
2679	BUG_ON(VMAP_BBMAP_BITS <= (`1UL` << order));
2680	bitmap_zero(dst: vb->used_map, VMAP_BBMAP_BITS);
2681	vb->free = VMAP_BBMAP_BITS - (`1UL` << order);
2682	vb->dirty = `0`;
2683	vb->dirty_min = VMAP_BBMAP_BITS;
2684	vb->dirty_max = `0`;
2685	bitmap_set(map: vb->used_map, start: `0`, nbits: (`1UL` << order));
2686	INIT_LIST_HEAD(list: &vb->free_list);
2687	vb->cpu = raw_smp_processor_id();
2688
2689	xa = addr_to_vb_xa(addr: va->va_start);
2690	vb_idx = addr_to_vb_idx(addr: va->va_start);
2691	err = xa_insert(xa, index: vb_idx, entry: vb, gfp: gfp_mask);
2692	if (err) {
2693	kfree(objp: vb);
2694	free_vmap_area(va);
2695	return ERR_PTR(error: err);
2696	}
2697	/*
2698	* list_add_tail_rcu could happened in another core
2699	* rather than vb->cpu due to task migration, which
2700	* is safe as list_add_tail_rcu will ensure the list's
2701	* integrity together with list_for_each_rcu from read
2702	* side.
2703	*/
2704	vbq = per_cpu_ptr(&vmap_block_queue, vb->cpu);
2705	spin_lock(lock: &vbq->lock);
2706	list_add_tail_rcu(new: &vb->free_list, head: &vbq->free);
2707	spin_unlock(lock: &vbq->lock);
2708
2709	return vaddr;
2710	}
2711
2712	static void free_vmap_block(struct vmap_block *vb)
2713	{
2714	struct vmap_node *vn;
2715	struct vmap_block *tmp;
2716	struct xarray *xa;
2717
2718	xa = addr_to_vb_xa(addr: vb->va->va_start);
2719	tmp = xa_erase(xa, index: addr_to_vb_idx(addr: vb->va->va_start));
2720	BUG_ON(tmp != vb);
2721
2722	vn = addr_to_node(addr: vb->va->va_start);
2723	spin_lock(lock: &vn->busy.lock);
2724	unlink_va(va: vb->va, root: &vn->busy.root);
2725	spin_unlock(lock: &vn->busy.lock);
2726
2727	free_vmap_area_noflush(va: vb->va);
2728	kfree_rcu(vb, rcu_head);
2729	}
2730
2731	static bool purge_fragmented_block(struct vmap_block *vb,
2732	struct list_head *purge_list, bool force_purge)
2733	{
2734	struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, vb->cpu);
2735
2736	if (vb->free + vb->dirty != VMAP_BBMAP_BITS \|\|
2737	vb->dirty == VMAP_BBMAP_BITS)
2738	return false;
2739
2740	/ Don't overeagerly purge usable blocks unless requested /
2741	if (!(force_purge \|\| vb->free < VMAP_PURGE_THRESHOLD))
2742	return false;
2743
2744	/ prevent further allocs after releasing lock /
2745	WRITE_ONCE(vb->free, `0`);
2746	/ prevent purging it again /
2747	WRITE_ONCE(vb->dirty, VMAP_BBMAP_BITS);
2748	vb->dirty_min = `0`;
2749	vb->dirty_max = VMAP_BBMAP_BITS;
2750	spin_lock(lock: &vbq->lock);
2751	list_del_rcu(entry: &vb->free_list);
2752	spin_unlock(lock: &vbq->lock);
2753	list_add_tail(new: &vb->purge, head: purge_list);
2754	return true;
2755	}
2756
2757	static void free_purged_blocks(struct list_head *purge_list)
2758	{
2759	struct vmap_block vb, n_vb;
2760
2761	list_for_each_entry_safe(vb, n_vb, purge_list, purge) {
2762	list_del(entry: &vb->purge);
2763	free_vmap_block(vb);
2764	}
2765	}
2766
2767	static void purge_fragmented_blocks(int cpu)
2768	{
2769	LIST_HEAD(purge);
2770	struct vmap_block *vb;
2771	struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
2772
2773	rcu_read_lock();
2774	list_for_each_entry_rcu(vb, &vbq->free, free_list) {
2775	unsigned long free = READ_ONCE(vb->free);
2776	unsigned long dirty = READ_ONCE(vb->dirty);
2777
2778	if (free + dirty != VMAP_BBMAP_BITS \|\|
2779	dirty == VMAP_BBMAP_BITS)
2780	continue;
2781
2782	spin_lock(lock: &vb->lock);
2783	purge_fragmented_block(vb, purge_list: &purge, force_purge: true);
2784	spin_unlock(lock: &vb->lock);
2785	}
2786	rcu_read_unlock();
2787	free_purged_blocks(purge_list: &purge);
2788	}
2789
2790	static void purge_fragmented_blocks_allcpus(void)
2791	{
2792	int cpu;
2793
2794	for_each_possible_cpu(cpu)
2795	purge_fragmented_blocks(cpu);
2796	}
2797
2798	static void vb_alloc(unsigned* long size, gfp_t gfp_mask)
2799	{
2800	struct vmap_block_queue *vbq;
2801	struct vmap_block *vb;
2802	void *vaddr = NULL;
2803	unsigned int order;
2804
2805	BUG_ON(offset_in_page(size));
2806	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
2807	if (WARN_ON(size == `0`)) {
2808	/*
2809	* Allocating 0 bytes isn't what caller wants since
2810	* get_order(0) returns funny result. Just warn and terminate
2811	* early.
2812	*/
2813	return ERR_PTR(error: -EINVAL);
2814	}
2815	order = get_order(size);
2816
2817	rcu_read_lock();
2818	vbq = raw_cpu_ptr(&vmap_block_queue);
2819	list_for_each_entry_rcu(vb, &vbq->free, free_list) {
2820	unsigned long pages_off;
2821
2822	if (READ_ONCE(vb->free) < (`1UL` << order))
2823	continue;
2824
2825	spin_lock(lock: &vb->lock);
2826	if (vb->free < (`1UL` << order)) {
2827	spin_unlock(lock: &vb->lock);
2828	continue;
2829	}
2830
2831	pages_off = VMAP_BBMAP_BITS - vb->free;
2832	vaddr = vmap_block_vaddr(va_start: vb->va->va_start, pages_off);
2833	WRITE_ONCE(vb->free, vb->free - (`1UL` << order));
2834	bitmap_set(map: vb->used_map, start: pages_off, nbits: (`1UL` << order));
2835	if (vb->free == `0`) {
2836	spin_lock(lock: &vbq->lock);
2837	list_del_rcu(entry: &vb->free_list);
2838	spin_unlock(lock: &vbq->lock);
2839	}
2840
2841	spin_unlock(lock: &vb->lock);
2842	break;
2843	}
2844
2845	rcu_read_unlock();
2846
2847	/ Allocate new block if nothing was found /
2848	if (!vaddr)
2849	vaddr = new_vmap_block(order, gfp_mask);
2850
2851	return vaddr;
2852	}
2853
2854	static void vb_free(unsigned long addr, unsigned long size)
2855	{
2856	unsigned long offset;
2857	unsigned int order;
2858	struct vmap_block *vb;
2859	struct xarray *xa;
2860
2861	BUG_ON(offset_in_page(size));
2862	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
2863
2864	flush_cache_vunmap(start: addr, end: addr + size);
2865
2866	order = get_order(size);
2867	offset = (addr & (VMAP_BLOCK_SIZE - `1`)) >> PAGE_SHIFT;
2868
2869	xa = addr_to_vb_xa(addr);
2870	vb = xa_load(xa, index: addr_to_vb_idx(addr));
2871
2872	spin_lock(lock: &vb->lock);
2873	bitmap_clear(map: vb->used_map, start: offset, nbits: (`1UL` << order));
2874	spin_unlock(lock: &vb->lock);
2875
2876	vunmap_range_noflush(start: addr, end: addr + size);
2877
2878	if (debug_pagealloc_enabled_static())
2879	flush_tlb_kernel_range(start: addr, end: addr + size);
2880
2881	spin_lock(lock: &vb->lock);
2882
2883	/ Expand the not yet TLB flushed dirty range /
2884	vb->dirty_min = min(vb->dirty_min, offset);
2885	vb->dirty_max = max(vb->dirty_max, offset + (`1UL` << order));
2886
2887	WRITE_ONCE(vb->dirty, vb->dirty + (`1UL` << order));
2888	if (vb->dirty == VMAP_BBMAP_BITS) {
2889	BUG_ON(vb->free);
2890	spin_unlock(lock: &vb->lock);
2891	free_vmap_block(vb);
2892	} else
2893	spin_unlock(lock: &vb->lock);
2894	}
2895
2896	static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
2897	{
2898	LIST_HEAD(purge_list);
2899	int cpu;
2900
2901	if (unlikely(!vmap_initialized))
2902	return;
2903
2904	mutex_lock(&vmap_purge_lock);
2905
2906	for_each_possible_cpu(cpu) {
2907	struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
2908	struct vmap_block *vb;
2909	unsigned long idx;
2910
2911	rcu_read_lock();
2912	xa_for_each(&vbq->vmap_blocks, idx, vb) {
2913	spin_lock(lock: &vb->lock);
2914
2915	/*
2916	* Try to purge a fragmented block first. If it's
2917	* not purgeable, check whether there is dirty
2918	* space to be flushed.
2919	*/
2920	if (!purge_fragmented_block(vb, purge_list: &purge_list, force_purge: false) &&
2921	vb->dirty_max && vb->dirty != VMAP_BBMAP_BITS) {
2922	unsigned long va_start = vb->va->va_start;
2923	unsigned long s, e;
2924
2925	s = va_start + (vb->dirty_min << PAGE_SHIFT);
2926	e = va_start + (vb->dirty_max << PAGE_SHIFT);
2927
2928	start = min(s, start);
2929	end = max(e, end);
2930
2931	/ Prevent that this is flushed again /
2932	vb->dirty_min = VMAP_BBMAP_BITS;
2933	vb->dirty_max = `0`;
2934
2935	flush = `1`;
2936	}
2937	spin_unlock(lock: &vb->lock);
2938	}
2939	rcu_read_unlock();
2940	}
2941	free_purged_blocks(purge_list: &purge_list);
2942
2943	if (!__purge_vmap_area_lazy(start, end, full_pool_decay: false) && flush)
2944	flush_tlb_kernel_range(start, end);
2945	mutex_unlock(lock: &vmap_purge_lock);
2946	}
2947
2948	/**
2949	* vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
2950	*
2951	* The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
2952	* to amortize TLB flushing overheads. What this means is that any page you
2953	* have now, may, in a former life, have been mapped into kernel virtual
2954	* address by the vmap layer and so there might be some CPUs with TLB entries
2955	* still referencing that page (additional to the regular 1:1 kernel mapping).
2956	*
2957	* vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
2958	* be sure that none of the pages we have control over will have any aliases
2959	* from the vmap layer.
2960	*/
2961	void vm_unmap_aliases(void)
2962	{
2963	_vm_unmap_aliases(ULONG_MAX, end: `0`, flush: `0`);
2964	}
2965	EXPORT_SYMBOL_GPL(vm_unmap_aliases);
2966
2967	/**
2968	* vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
2969	* @mem: the pointer returned by vm_map_ram
2970	* @count: the count passed to that vm_map_ram call (cannot unmap partial)
2971	*/
2972	void vm_unmap_ram(const void mem, unsigned* int count)
2973	{
2974	unsigned long size = (unsigned long)count << PAGE_SHIFT;
2975	unsigned long addr = (unsigned long)kasan_reset_tag(addr: mem);
2976	struct vmap_area *va;
2977
2978	might_sleep();
2979	BUG_ON(!addr);
2980	BUG_ON(addr < VMALLOC_START);
2981	BUG_ON(addr > VMALLOC_END);
2982	BUG_ON(!PAGE_ALIGNED(addr));
2983
2984	kasan_poison_vmalloc(start: mem, size);
2985
2986	if (likely(count <= VMAP_MAX_ALLOC)) {
2987	debug_check_no_locks_freed(from: mem, len: size);
2988	vb_free(addr, size);
2989	return;
2990	}
2991
2992	va = find_unlink_vmap_area(addr);
2993	if (WARN_ON_ONCE(!va))
2994	return;
2995
2996	debug_check_no_locks_freed(from: (void *)va->va_start, len: va_size(va));
2997	free_unmap_vmap_area(va);
2998	}
2999	EXPORT_SYMBOL(vm_unmap_ram);
3000
3001	/**
3002	* vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
3003	* @pages: an array of pointers to the pages to be mapped
3004	* @count: number of pages
3005	* @node: prefer to allocate data structures on this node
3006	*
3007	* If you use this function for less than VMAP_MAX_ALLOC pages, it could be
3008	* faster than vmap so it's good. But if you mix long-life and short-life
3009	* objects with vm_map_ram(), it could consume lots of address space through
3010	* fragmentation (especially on a 32bit machine). You could see failures in
3011	* the end. Please use this function for short-lived objects.
3012	*
3013	* Returns: a pointer to the address that has been mapped, or %NULL on failure
3014	*/
3015	void vm_map_ram(struct* page *pages, unsigned* int count, int node)
3016	{
3017	unsigned long size = (unsigned long)count << PAGE_SHIFT;
3018	unsigned long addr;
3019	void *mem;
3020
3021	if (likely(count <= VMAP_MAX_ALLOC)) {
3022	mem = vb_alloc(size, GFP_KERNEL);
3023	if (IS_ERR(ptr: mem))
3024	return NULL;
3025	addr = (unsigned long)mem;
3026	} else {
3027	struct vmap_area *va;
3028	va = alloc_vmap_area(size, PAGE_SIZE,
3029	VMALLOC_START, VMALLOC_END,
3030	node, GFP_KERNEL, VMAP_RAM,
3031	NULL);
3032	if (IS_ERR(ptr: va))
3033	return NULL;
3034
3035	addr = va->va_start;
3036	mem = (void *)addr;
3037	}
3038
3039	if (vmap_pages_range(addr, end: addr + size, PAGE_KERNEL,
3040	pages, PAGE_SHIFT) < `0`) {
3041	vm_unmap_ram(mem, count);
3042	return NULL;
3043	}
3044
3045	/*
3046	* Mark the pages as accessible, now that they are mapped.
3047	* With hardware tag-based KASAN, marking is skipped for
3048	* non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
3049	*/
3050	mem = kasan_unpoison_vmalloc(start: mem, size, KASAN_VMALLOC_PROT_NORMAL);
3051
3052	return mem;
3053	}
3054	EXPORT_SYMBOL(vm_map_ram);
3055
3056	static struct vm_struct *vmlist __initdata;
3057
3058	static inline unsigned int vm_area_page_order(struct vm_struct *vm)
3059	{
3060	#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
3061	return vm->page_order;
3062	#else
3063	return `0`;
3064	#endif
3065	}
3066
/* Non-inline accessor around vm_area_page_order(). */
unsigned int get_vm_area_page_order(struct vm_struct *vm)
{
	unsigned int order = vm_area_page_order(vm);

	return order;
}
3071
/*
 * Record @order as the page order of @vm. Without
 * CONFIG_HAVE_ARCH_HUGE_VMALLOC only order 0 is valid and anything else
 * triggers a BUG.
 */
static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order)
{
#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
	vm->page_order = order;
#else
	BUG_ON(order != 0);
#endif
}
3080
3081	/**
3082	* vm_area_add_early - add vmap area early during boot
3083	* @vm: vm_struct to add
3084	*
3085	* This function is used to add fixed kernel vm area to vmlist before
3086	* vmalloc_init() is called. @vm->addr, @vm->size, and @vm->flags
3087	* should contain proper values and the other fields should be zero.
3088	*
3089	* DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
3090	*/
3091	void __init vm_area_add_early(struct vm_struct *vm)
3092	{
3093	struct vm_struct tmp, *p;
3094
3095	BUG_ON(vmap_initialized);
3096	for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
3097	if (tmp->addr >= vm->addr) {
3098	BUG_ON(tmp->addr < vm->addr + vm->size);
3099	break;
3100	} else
3101	BUG_ON(tmp->addr + tmp->size > vm->addr);
3102	}
3103	vm->next = *p;
3104	*p = vm;
3105	}
3106
3107	/**
3108	* vm_area_register_early - register vmap area early during boot
3109	* @vm: vm_struct to register
3110	* @align: requested alignment
3111	*
3112	* This function is used to register kernel vm area before
3113	* vmalloc_init() is called. @vm->size and @vm->flags should contain
3114	* proper values on entry and other fields should be zero. On return,
3115	* vm->addr contains the allocated address.
3116	*
3117	* DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
3118	*/
3119	void __init vm_area_register_early(struct vm_struct *vm, size_t align)
3120	{
3121	unsigned long addr = ALIGN(VMALLOC_START, align);
3122	struct vm_struct cur, *p;
3123
3124	BUG_ON(vmap_initialized);
3125
3126	for (p = &vmlist; (cur = *p) != NULL; p = &cur->next) {
3127	if ((unsigned long)cur->addr - addr >= vm->size)
3128	break;
3129	addr = ALIGN((unsigned long)cur->addr + cur->size, align);
3130	}
3131
3132	BUG_ON(addr > VMALLOC_END - vm->size);
3133	vm->addr = (void *)addr;
3134	vm->next = *p;
3135	*p = vm;
3136	kasan_populate_early_vm_area_shadow(start: vm->addr, size: vm->size);
3137	}
3138
3139	static void clear_vm_uninitialized_flag(struct vm_struct *vm)
3140	{
3141	/*
3142	* Before removing VM_UNINITIALIZED,
3143	* we should make sure that vm has proper values.
3144	* Pair with smp_rmb() in vread_iter() and vmalloc_info_show().
3145	*/
3146	smp_wmb();
3147	vm->flags &= ~VM_UNINITIALIZED;
3148	}
3149
3150	struct vm_struct __get_vm_area_node(unsigned* long size,
3151	unsigned long align, unsigned long shift, unsigned long flags,
3152	unsigned long start, unsigned long end, int node,
3153	gfp_t gfp_mask, const void *caller)
3154	{
3155	struct vmap_area *va;
3156	struct vm_struct *area;
3157	unsigned long requested_size = size;
3158
3159	BUG_ON(in_interrupt());
3160	size = ALIGN(size, `1ul` << shift);
3161	if (unlikely(!size))
3162	return NULL;
3163
3164	if (flags & VM_IOREMAP)
3165	align = `1ul` << clamp_t(int, get_count_order_long(size),
3166	PAGE_SHIFT, IOREMAP_MAX_ORDER);
3167
3168	area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
3169	if (unlikely(!area))
3170	return NULL;
3171
3172	if (!(flags & VM_NO_GUARD))
3173	size += PAGE_SIZE;
3174
3175	area->flags = flags;
3176	area->caller = caller;
3177	area->requested_size = requested_size;
3178
3179	va = alloc_vmap_area(size, align, vstart: start, vend: end, node, gfp_mask, va_flags: `0`, vm: area);
3180	if (IS_ERR(ptr: va)) {
3181	kfree(objp: area);
3182	return NULL;
3183	}
3184
3185	/*
3186	* Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
3187	* best-effort approach, as they can be mapped outside of vmalloc code.
3188	* For VM_ALLOC mappings, the pages are marked as accessible after
3189	* getting mapped in __vmalloc_node_range().
3190	* With hardware tag-based KASAN, marking is skipped for
3191	* non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
3192	*/
3193	if (!(flags & VM_ALLOC))
3194	area->addr = kasan_unpoison_vmalloc(start: area->addr, size: requested_size,
3195	KASAN_VMALLOC_PROT_NORMAL);
3196
3197	return area;
3198	}
3199
3200	struct vm_struct __get_vm_area_caller(unsigned* long size, unsigned long flags,
3201	unsigned long start, unsigned long end,
3202	const void *caller)
3203	{
3204	return __get_vm_area_node(size, align: `1`, PAGE_SHIFT, flags, start, end,
3205	NUMA_NO_NODE, GFP_KERNEL, caller);
3206	}
3207
3208	/**
3209	* get_vm_area - reserve a contiguous kernel virtual area
3210	* @size: size of the area
3211	* @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
3212	*
3213	* Search an area of @size in the kernel virtual mapping area,
3214	* and reserved it for out purposes. Returns the area descriptor
3215	* on success or %NULL on failure.
3216	*
3217	* Return: the area descriptor on success or %NULL on failure.
3218	*/
3219	struct vm_struct get_vm_area(unsigned* long size, unsigned long flags)
3220	{
3221	return __get_vm_area_node(size, align: `1`, PAGE_SHIFT, flags,
3222	VMALLOC_START, VMALLOC_END,
3223	NUMA_NO_NODE, GFP_KERNEL,
3224	caller: __builtin_return_address(`0`));
3225	}
3226
3227	struct vm_struct get_vm_area_caller(unsigned* long size, unsigned long flags,
3228	const void *caller)
3229	{
3230	return __get_vm_area_node(size, align: `1`, PAGE_SHIFT, flags,
3231	VMALLOC_START, VMALLOC_END,
3232	NUMA_NO_NODE, GFP_KERNEL, caller);
3233	}
3234
3235	/**
3236	* find_vm_area - find a continuous kernel virtual area
3237	* @addr: base address
3238	*
3239	* Search for the kernel VM area starting at @addr, and return it.
3240	* It is up to the caller to do all required locking to keep the returned
3241	* pointer valid.
3242	*
3243	* Return: the area descriptor on success or %NULL on failure.
3244	*/
3245	struct vm_struct find_vm_area(const* void *addr)
3246	{
3247	struct vmap_area *va;
3248
3249	va = find_vmap_area(addr: (unsigned long)addr);
3250	if (!va)
3251	return NULL;
3252
3253	return va->vm;
3254	}
3255
3256	/**
3257	* remove_vm_area - find and remove a continuous kernel virtual area
3258	* @addr: base address
3259	*
3260	* Search for the kernel VM area starting at @addr, and remove it.
3261	* This function returns the found VM area, but using it is NOT safe
3262	* on SMP machines, except for its size or flags.
3263	*
3264	* Return: the area descriptor on success or %NULL on failure.
3265	*/
3266	struct vm_struct remove_vm_area(const* void *addr)
3267	{
3268	struct vmap_area *va;
3269	struct vm_struct *vm;
3270
3271	might_sleep();
3272
3273	if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
3274	addr))
3275	return NULL;
3276
3277	va = find_unlink_vmap_area(addr: (unsigned long)addr);
3278	if (!va \|\| !va->vm)
3279	return NULL;
3280	vm = va->vm;
3281
3282	debug_check_no_locks_freed(from: vm->addr, len: get_vm_area_size(area: vm));
3283	debug_check_no_obj_freed(address: vm->addr, size: get_vm_area_size(area: vm));
3284	kasan_free_module_shadow(vm);
3285	kasan_poison_vmalloc(start: vm->addr, size: get_vm_area_size(area: vm));
3286
3287	free_unmap_vmap_area(va);
3288	return vm;
3289	}
3290
3291	static inline void set_area_direct_map(const struct vm_struct *area,
3292	int (set_direct_map)(struct* page *page))
3293	{
3294	int i;
3295
3296	/ HUGE_VMALLOC passes small pages to set_direct_map /
3297	for (i = `0`; i < area->nr_pages; i++)
3298	if (page_address(area->pages[i]))
3299	set_direct_map(area->pages[i]);
3300	}
3301
3302	/*
3303	* Flush the vm mapping and reset the direct map.
3304	*/
3305	static void vm_reset_perms(struct vm_struct *area)
3306	{
3307	unsigned long start = ULONG_MAX, end = `0`;
3308	unsigned int page_order = vm_area_page_order(vm: area);
3309	int flush_dmap = `0`;
3310	int i;
3311
3312	/*
3313	* Find the start and end range of the direct mappings to make sure that
3314	* the vm_unmap_aliases() flush includes the direct map.
3315	*/
3316	for (i = `0`; i < area->nr_pages; i += `1U` << page_order) {
3317	unsigned long addr = (unsigned long)page_address(area->pages[i]);
3318
3319	if (addr) {
3320	unsigned long page_size;
3321
3322	page_size = PAGE_SIZE << page_order;
3323	start = min(addr, start);
3324	end = max(addr + page_size, end);
3325	flush_dmap = `1`;
3326	}
3327	}
3328
3329	/*
3330	* Set direct map to something invalid so that it won't be cached if
3331	* there are any accesses after the TLB flush, then flush the TLB and
3332	* reset the direct map permissions to the default.
3333	*/
3334	set_area_direct_map(area, set_direct_map: set_direct_map_invalid_noflush);
3335	_vm_unmap_aliases(start, end, flush: flush_dmap);
3336	set_area_direct_map(area, set_direct_map: set_direct_map_default_noflush);
3337	}
3338
3339	static void delayed_vfree_work(struct work_struct *w)
3340	{
3341	struct vfree_deferred p = container_of(w, struct* vfree_deferred, wq);
3342	struct llist_node t, llnode;
3343
3344	llist_for_each_safe(llnode, t, llist_del_all(&p->list))
3345	vfree(addr: llnode);
3346	}
3347
3348	/**
3349	* vfree_atomic - release memory allocated by vmalloc()
3350	* @addr: memory base address
3351	*
3352	* This one is just like vfree() but can be called in any atomic context
3353	* except NMIs.
3354	*/
3355	void vfree_atomic(const void *addr)
3356	{
3357	struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
3358
3359	BUG_ON(in_nmi());
3360	kmemleak_free(ptr: addr);
3361
3362	/*
3363	* Use raw_cpu_ptr() because this can be called from preemptible
3364	* context. Preemption is absolutely fine here, because the llist_add()
3365	* implementation is lockless, so it works even if we are adding to
3366	* another cpu's list. schedule_work() should be fine with this too.
3367	*/
3368	if (addr && llist_add(new: (struct llist_node *)addr, head: &p->list))
3369	schedule_work(work: &p->wq);
3370	}
3371
3372	/**
3373	* vfree - Release memory allocated by vmalloc()
3374	* @addr: Memory base address
3375	*
3376	* Free the virtually continuous memory area starting at @addr, as obtained
3377	* from one of the vmalloc() family of APIs. This will usually also free the
3378	* physical memory underlying the virtual allocation, but that memory is
3379	* reference counted, so it will not be freed until the last user goes away.
3380	*
3381	* If @addr is NULL, no operation is performed.
3382	*
3383	* Context:
3384	* May sleep if called not from interrupt context.
3385	* Must not be called in NMI context (strictly speaking, it could be
3386	* if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
3387	* conventions for vfree() arch-dependent would be a really bad idea).
3388	*/
3389	void vfree(const void *addr)
3390	{
3391	struct vm_struct *vm;
3392	int i;
3393
3394	if (unlikely(in_interrupt())) {
3395	vfree_atomic(addr);
3396	return;
3397	}
3398
3399	BUG_ON(in_nmi());
3400	kmemleak_free(ptr: addr);
3401	might_sleep();
3402
3403	if (!addr)
3404	return;
3405
3406	vm = remove_vm_area(addr);
3407	if (unlikely(!vm)) {
3408	WARN(`1`, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
3409	addr);
3410	return;
3411	}
3412
3413	if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS))
3414	vm_reset_perms(area: vm);
3415	/ All pages of vm should be charged to same memcg, so use first one. /
3416	if (vm->nr_pages && !(vm->flags & VM_MAP_PUT_PAGES))
3417	mod_memcg_page_state(page: vm->pages[`0`], idx: MEMCG_VMALLOC, val: -vm->nr_pages);
3418	for (i = `0`; i < vm->nr_pages; i++) {
3419	struct page *page = vm->pages[i];
3420
3421	BUG_ON(!page);
3422	/*
3423	* High-order allocs for huge vmallocs are split, so
3424	* can be freed as an array of order-0 allocations
3425	*/
3426	__free_page(page);
3427	cond_resched();
3428	}
3429	if (!(vm->flags & VM_MAP_PUT_PAGES))
3430	atomic_long_sub(i: vm->nr_pages, v: &nr_vmalloc_pages);
3431	kvfree(addr: vm->pages);
3432	kfree(objp: vm);
3433	}
3434	EXPORT_SYMBOL(vfree);
3435
3436	/**
3437	* vunmap - release virtual mapping obtained by vmap()
3438	* @addr: memory base address
3439	*
3440	* Free the virtually contiguous memory area starting at @addr,
3441	* which was created from the page array passed to vmap().
3442	*
3443	* Must not be called in interrupt context.
3444	*/
3445	void vunmap(const void *addr)
3446	{
3447	struct vm_struct *vm;
3448
3449	BUG_ON(in_interrupt());
3450	might_sleep();
3451
3452	if (!addr)
3453	return;
3454	vm = remove_vm_area(addr);
3455	if (unlikely(!vm)) {
3456	WARN(`1`, KERN_ERR "Trying to vunmap() nonexistent vm area (%p)\n",
3457	addr);
3458	return;
3459	}
3460	kfree(objp: vm);
3461	}
3462	EXPORT_SYMBOL(vunmap);
3463
3464	/**
3465	* vmap - map an array of pages into virtually contiguous space
3466	* @pages: array of page pointers
3467	* @count: number of pages to map
3468	* @flags: vm_area->flags
3469	* @prot: page protection for the mapping
3470	*
3471	* Maps @count pages from @pages into contiguous kernel virtual space.
3472	* If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself
3473	* (which must be kmalloc or vmalloc memory) and one reference per pages in it
3474	* are transferred from the caller to vmap(), and will be freed / dropped when
3475	* vfree() is called on the return value.
3476	*
3477	* Return: the address of the area or %NULL on failure
3478	*/
3479	void vmap(struct* page *pages, unsigned* int count,
3480	unsigned long flags, pgprot_t prot)
3481	{
3482	struct vm_struct *area;
3483	unsigned long addr;
3484	unsigned long size; / In bytes /
3485
3486	might_sleep();
3487
3488	if (WARN_ON_ONCE(flags & VM_FLUSH_RESET_PERMS))
3489	return NULL;
3490
3491	/*
3492	* Your top guard is someone else's bottom guard. Not having a top
3493	* guard compromises someone else's mappings too.
3494	*/
3495	if (WARN_ON_ONCE(flags & VM_NO_GUARD))
3496	flags &= ~VM_NO_GUARD;
3497
3498	if (count > totalram_pages())
3499	return NULL;
3500
3501	size = (unsigned long)count << PAGE_SHIFT;
3502	area = get_vm_area_caller(size, flags, caller: __builtin_return_address(`0`));
3503	if (!area)
3504	return NULL;
3505
3506	addr = (unsigned long)area->addr;
3507	if (vmap_pages_range(addr, end: addr + size, pgprot_nx(prot),
3508	pages, PAGE_SHIFT) < `0`) {
3509	vunmap(area->addr);
3510	return NULL;
3511	}
3512
3513	if (flags & VM_MAP_PUT_PAGES) {
3514	area->pages = pages;
3515	area->nr_pages = count;
3516	}
3517	return area->addr;
3518	}
3519	EXPORT_SYMBOL(vmap);
3520
3521	#ifdef CONFIG_VMAP_PFN
3522	struct vmap_pfn_data {
3523	unsigned long *pfns;
3524	pgprot_t prot;
3525	unsigned int idx;
3526	};
3527
3528	static int vmap_pfn_apply(pte_t pte, unsigned* long addr, void *private)
3529	{
3530	struct vmap_pfn_data *data = private;
3531	unsigned long pfn = data->pfns[data->idx];
3532	pte_t ptent;
3533
3534	if (WARN_ON_ONCE(pfn_valid(pfn)))
3535	return -EINVAL;
3536
3537	ptent = pte_mkspecial(pte: pfn_pte(page_nr: pfn, pgprot: data->prot));
3538	set_pte_at(&init_mm, addr, pte, ptent);
3539
3540	data->idx++;
3541	return `0`;
3542	}
3543
3544	/**
3545	* vmap_pfn - map an array of PFNs into virtually contiguous space
3546	* @pfns: array of PFNs
3547	* @count: number of pages to map
3548	* @prot: page protection for the mapping
3549	*
3550	* Maps @count PFNs from @pfns into contiguous kernel virtual space and returns
3551	* the start address of the mapping.
3552	*/
3553	void vmap_pfn(unsigned* long pfns, unsigned* int count, pgprot_t prot)
3554	{
3555	struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) };
3556	struct vm_struct *area;
3557
3558	area = get_vm_area_caller(size: count * PAGE_SIZE, VM_IOREMAP,
3559	caller: __builtin_return_address(`0`));
3560	if (!area)
3561	return NULL;
3562	if (apply_to_page_range(mm: &init_mm, address: (unsigned long)area->addr,
3563	size: count * PAGE_SIZE, fn: vmap_pfn_apply, data: &data)) {
3564	free_vm_area(area);
3565	return NULL;
3566	}
3567
3568	flush_cache_vmap(start: (unsigned long)area->addr,
3569	end: (unsigned long)area->addr + count * PAGE_SIZE);
3570
3571	return area->addr;
3572	}
3573	EXPORT_SYMBOL_GPL(vmap_pfn);
3574	#endif /* CONFIG_VMAP_PFN */
3575
3576	static inline unsigned int
3577	vm_area_alloc_pages(gfp_t gfp, int nid,
3578	unsigned int order, unsigned int nr_pages, struct page **pages)
3579	{
3580	unsigned int nr_allocated = `0`;
3581	struct page *page;
3582	int i;
3583
3584	/*
3585	* For order-0 pages we make use of bulk allocator, if
3586	* the page array is partly or not at all populated due
3587	* to fails, fallback to a single page allocator that is
3588	* more permissive.
3589	*/
3590	if (!order) {
3591	while (nr_allocated < nr_pages) {
3592	unsigned int nr, nr_pages_request;
3593
3594	/*
3595	* A maximum allowed request is hard-coded and is 100
3596	* pages per call. That is done in order to prevent a
3597	* long preemption off scenario in the bulk-allocator
3598	* so the range is [1:100].
3599	*/
3600	nr_pages_request = min(`100U`, nr_pages - nr_allocated);
3601
3602	/ memory allocation should consider mempolicy, we can't*
3603	* wrongly use nearest node when nid == NUMA_NO_NODE,
3604	* otherwise memory may be allocated in only one node,
3605	* but mempolicy wants to alloc memory by interleaving.
3606	*/
3607	if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE)
3608	nr = alloc_pages_bulk_mempolicy_noprof(gfp,
3609	nr_pages: nr_pages_request,
3610	page_array: pages + nr_allocated);
3611	else
3612	nr = alloc_pages_bulk_node_noprof(gfp, nid,
3613	nr_pages: nr_pages_request,
3614	page_array: pages + nr_allocated);
3615
3616	nr_allocated += nr;
3617	cond_resched();
3618
3619	/*
3620	* If zero or pages were obtained partly,
3621	* fallback to a single page allocator.
3622	*/
3623	if (nr != nr_pages_request)
3624	break;
3625	}
3626	}
3627
3628	/ High-order pages or fallback path if "bulk" fails. /
3629	while (nr_allocated < nr_pages) {
3630	if (!(gfp & __GFP_NOFAIL) && fatal_signal_pending(current))
3631	break;
3632
3633	if (nid == NUMA_NO_NODE)
3634	page = alloc_pages_noprof(gfp, order);
3635	else
3636	page = alloc_pages_node_noprof(nid, gfp_mask: gfp, order);
3637
3638	if (unlikely(!page))
3639	break;
3640
3641	/*
3642	* High-order allocations must be able to be treated as
3643	* independent small pages by callers (as they can with
3644	* small-page vmallocs). Some drivers do their own refcounting
3645	* on vmalloc_to_page() pages, some use page->mapping,
3646	* page->lru, etc.
3647	*/
3648	if (order)
3649	split_page(page, order);
3650
3651	/*
3652	* Careful, we allocate and map page-order pages, but
3653	* tracking is done per PAGE_SIZE page so as to keep the
3654	* vm_struct APIs independent of the physical/mapped size.
3655	*/
3656	for (i = `0`; i < (`1U` << order); i++)
3657	pages[nr_allocated + i] = page + i;
3658
3659	cond_resched();
3660	nr_allocated += `1U` << order;
3661	}
3662
3663	return nr_allocated;
3664	}
3665
3666	static void __vmalloc_area_node(struct* vm_struct *area, gfp_t gfp_mask,
3667	pgprot_t prot, unsigned int page_shift,
3668	int node)
3669	{
3670	const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) \| __GFP_ZERO;
3671	bool nofail = gfp_mask & __GFP_NOFAIL;
3672	unsigned long addr = (unsigned long)area->addr;
3673	unsigned long size = get_vm_area_size(area);
3674	unsigned long array_size;
3675	unsigned int nr_small_pages = size >> PAGE_SHIFT;
3676	unsigned int page_order;
3677	unsigned int flags;
3678	int ret;
3679
3680	array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
3681
3682	if (!(gfp_mask & (GFP_DMA \| GFP_DMA32)))
3683	gfp_mask \|= __GFP_HIGHMEM;
3684
3685	/ Please note that the recursion is strictly bounded. /
3686	if (array_size > PAGE_SIZE) {
3687	area->pages = __vmalloc_node_noprof(size: array_size, align: `1`, gfp_mask: nested_gfp, node,
3688	caller: area->caller);
3689	} else {
3690	area->pages = kmalloc_node_noprof(size: array_size, flags: nested_gfp, node);
3691	}
3692
3693	if (!area->pages) {
3694	warn_alloc(gfp_mask, NULL,
3695	fmt: "vmalloc error: size %lu, failed to allocated page array size %lu",
3696	nr_small_pages * PAGE_SIZE, array_size);
3697	free_vm_area(area);
3698	return NULL;
3699	}
3700
3701	set_vm_area_page_order(vm: area, order: page_shift - PAGE_SHIFT);
3702	page_order = vm_area_page_order(vm: area);
3703
3704	/*
3705	* High-order nofail allocations are really expensive and
3706	* potentially dangerous (pre-mature OOM, disruptive reclaim
3707	* and compaction etc.
3708	*
3709	* Please note, the __vmalloc_node_range_noprof() falls-back
3710	* to order-0 pages if high-order attempt is unsuccessful.
3711	*/
3712	area->nr_pages = vm_area_alloc_pages(gfp: (page_order ?
3713	gfp_mask & ~__GFP_NOFAIL : gfp_mask) \| __GFP_NOWARN,
3714	nid: node, order: page_order, nr_pages: nr_small_pages, pages: area->pages);
3715
3716	atomic_long_add(i: area->nr_pages, v: &nr_vmalloc_pages);
3717	/ All pages of vm should be charged to same memcg, so use first one. /
3718	if (gfp_mask & __GFP_ACCOUNT && area->nr_pages)
3719	mod_memcg_page_state(page: area->pages[`0`], idx: MEMCG_VMALLOC,
3720	val: area->nr_pages);
3721
3722	/*
3723	* If not enough pages were obtained to accomplish an
3724	* allocation request, free them via vfree() if any.
3725	*/
3726	if (area->nr_pages != nr_small_pages) {
3727	/*
3728	* vm_area_alloc_pages() can fail due to insufficient memory but
3729	* also:-
3730	*
3731	* - a pending fatal signal
3732	* - insufficient huge page-order pages
3733	*
3734	* Since we always retry allocations at order-0 in the huge page
3735	* case a warning for either is spurious.
3736	*/
3737	if (!fatal_signal_pending(current) && page_order == `0`)
3738	warn_alloc(gfp_mask, NULL,
3739	fmt: "vmalloc error: size %lu, failed to allocate pages",
3740	area->nr_pages * PAGE_SIZE);
3741	goto fail;
3742	}
3743
3744	/*
3745	* page tables allocations ignore external gfp mask, enforce it
3746	* by the scope API
3747	*/
3748	if ((gfp_mask & (__GFP_FS \| __GFP_IO)) == __GFP_IO)
3749	flags = memalloc_nofs_save();
3750	else if ((gfp_mask & (__GFP_FS \| __GFP_IO)) == `0`)
3751	flags = memalloc_noio_save();
3752
3753	do {
3754	ret = vmap_pages_range(addr, end: addr + size, prot, pages: area->pages,
3755	page_shift);
3756	if (nofail && (ret < `0`))
3757	schedule_timeout_uninterruptible(timeout: `1`);
3758	} while (nofail && (ret < `0`));
3759
3760	if ((gfp_mask & (__GFP_FS \| __GFP_IO)) == __GFP_IO)
3761	memalloc_nofs_restore(flags);
3762	else if ((gfp_mask & (__GFP_FS \| __GFP_IO)) == `0`)
3763	memalloc_noio_restore(flags);
3764
3765	if (ret < `0`) {
3766	warn_alloc(gfp_mask, NULL,
3767	fmt: "vmalloc error: size %lu, failed to map pages",
3768	area->nr_pages * PAGE_SIZE);
3769	goto fail;
3770	}
3771
3772	return area->addr;
3773
3774	fail:
3775	vfree(area->addr);
3776	return NULL;
3777	}
3778
3779	/**
3780	* __vmalloc_node_range - allocate virtually contiguous memory
3781	* @size: allocation size
3782	* @align: desired alignment
3783	* @start: vm area range start
3784	* @end: vm area range end
3785	* @gfp_mask: flags for the page level allocator
3786	* @prot: protection mask for the allocated pages
3787	* @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD)
3788	* @node: node to use for allocation or NUMA_NO_NODE
3789	* @caller: caller's return address
3790	*
3791	* Allocate enough pages to cover @size from the page level
3792	* allocator with @gfp_mask flags. Please note that the full set of gfp
3793	* flags are not supported. GFP_KERNEL, GFP_NOFS and GFP_NOIO are all
3794	* supported.
3795	* Zone modifiers are not supported. From the reclaim modifiers
3796	* __GFP_DIRECT_RECLAIM is required (aka GFP_NOWAIT is not supported)
3797	* and only __GFP_NOFAIL is supported (i.e. __GFP_NORETRY and
3798	* __GFP_RETRY_MAYFAIL are not supported).
3799	*
3800	* __GFP_NOWARN can be used to suppress failures messages.
3801	*
3802	* Map them into contiguous kernel virtual space, using a pagetable
3803	* protection of @prot.
3804	*
3805	* Return: the address of the area or %NULL on failure
3806	*/
3807	void __vmalloc_node_range_noprof(unsigned* long size, unsigned long align,
3808	unsigned long start, unsigned long end, gfp_t gfp_mask,
3809	pgprot_t prot, unsigned long vm_flags, int node,
3810	const void *caller)
3811	{
3812	struct vm_struct *area;
3813	void *ret;
3814	kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE;
3815	unsigned long original_align = align;
3816	unsigned int shift = PAGE_SHIFT;
3817
3818	if (WARN_ON_ONCE(!size))
3819	return NULL;
3820
3821	if ((size >> PAGE_SHIFT) > totalram_pages()) {
3822	warn_alloc(gfp_mask, NULL,
3823	fmt: "vmalloc error: size %lu, exceeds total pages",
3824	size);
3825	return NULL;
3826	}
3827
3828	if (vmap_allow_huge && (vm_flags & VM_ALLOW_HUGE_VMAP)) {
3829	/*
3830	* Try huge pages. Only try for PAGE_KERNEL allocations,
3831	* others like modules don't yet expect huge pages in
3832	* their allocations due to apply_to_page_range not
3833	* supporting them.
3834	*/
3835
3836	if (arch_vmap_pmd_supported(prot) && size >= PMD_SIZE)
3837	shift = PMD_SHIFT;
3838	else
3839	shift = arch_vmap_pte_supported_shift(size);
3840
3841	align = max(original_align, `1UL` << shift);
3842	}
3843
3844	again:
3845	area = __get_vm_area_node(size, align, shift, VM_ALLOC \|
3846	VM_UNINITIALIZED \| vm_flags, start, end, node,
3847	gfp_mask, caller);
3848	if (!area) {
3849	bool nofail = gfp_mask & __GFP_NOFAIL;
3850	warn_alloc(gfp_mask, NULL,
3851	fmt: "vmalloc error: size %lu, vm_struct allocation failed%s",
3852	size, (nofail) ? ". Retrying." : "");
3853	if (nofail) {
3854	schedule_timeout_uninterruptible(timeout: `1`);
3855	goto again;
3856	}
3857	goto fail;
3858	}
3859
3860	/*
3861	* Prepare arguments for __vmalloc_area_node() and
3862	* kasan_unpoison_vmalloc().
3863	*/
3864	if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) {
3865	if (kasan_hw_tags_enabled()) {
3866	/*
3867	* Modify protection bits to allow tagging.
3868	* This must be done before mapping.
3869	*/
3870	prot = arch_vmap_pgprot_tagged(prot);
3871
3872	/*
3873	* Skip page_alloc poisoning and zeroing for physical
3874	* pages backing VM_ALLOC mapping. Memory is instead
3875	* poisoned and zeroed by kasan_unpoison_vmalloc().
3876	*/
3877	gfp_mask \|= __GFP_SKIP_KASAN \| __GFP_SKIP_ZERO;
3878	}
3879
3880	/ Take note that the mapping is PAGE_KERNEL. /
3881	kasan_flags \|= KASAN_VMALLOC_PROT_NORMAL;
3882	}
3883
3884	/ Allocate physical pages and map them into vmalloc space. /
3885	ret = __vmalloc_area_node(area, gfp_mask, prot, page_shift: shift, node);
3886	if (!ret)
3887	goto fail;
3888
3889	/*
3890	* Mark the pages as accessible, now that they are mapped.
3891	* The condition for setting KASAN_VMALLOC_INIT should complement the
3892	* one in post_alloc_hook() with regards to the __GFP_SKIP_ZERO check
3893	* to make sure that memory is initialized under the same conditions.
3894	* Tag-based KASAN modes only assign tags to normal non-executable
3895	* allocations, see __kasan_unpoison_vmalloc().
3896	*/
3897	kasan_flags \|= KASAN_VMALLOC_VM_ALLOC;
3898	if (!want_init_on_free() && want_init_on_alloc(flags: gfp_mask) &&
3899	(gfp_mask & __GFP_SKIP_ZERO))
3900	kasan_flags \|= KASAN_VMALLOC_INIT;
3901	/ KASAN_VMALLOC_PROT_NORMAL already set if required. /
3902	area->addr = kasan_unpoison_vmalloc(start: area->addr, size, flags: kasan_flags);
3903
3904	/*
3905	* In this function, newly allocated vm_struct has VM_UNINITIALIZED
3906	* flag. It means that vm_struct is not fully initialized.
3907	* Now, it is fully initialized, so remove this flag here.
3908	*/
3909	clear_vm_uninitialized_flag(vm: area);
3910
3911	if (!(vm_flags & VM_DEFER_KMEMLEAK))
3912	kmemleak_vmalloc(area, PAGE_ALIGN(size), gfp: gfp_mask);
3913
3914	return area->addr;
3915
3916	fail:
3917	if (shift > PAGE_SHIFT) {
3918	shift = PAGE_SHIFT;
3919	align = original_align;
3920	goto again;
3921	}
3922
3923	return NULL;
3924	}
3925
3926	/**
3927	* __vmalloc_node - allocate virtually contiguous memory
3928	* @size: allocation size
3929	* @align: desired alignment
3930	* @gfp_mask: flags for the page level allocator
3931	* @node: node to use for allocation or NUMA_NO_NODE
3932	* @caller: caller's return address
3933	*
3934	* Allocate enough pages to cover @size from the page level allocator with
3935	* @gfp_mask flags. Map them into contiguous kernel virtual space.
3936	*
3937	* Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
3938	* and __GFP_NOFAIL are not supported
3939	*
3940	* Any use of gfp flags outside of GFP_KERNEL should be consulted
3941	* with mm people.
3942	*
3943	* Return: pointer to the allocated memory or %NULL on error
3944	*/
3945	void __vmalloc_node_noprof(unsigned* long size, unsigned long align,
3946	gfp_t gfp_mask, int node, const void *caller)
3947	{
3948	return __vmalloc_node_range_noprof(size, align, VMALLOC_START, VMALLOC_END,
3949	gfp_mask, PAGE_KERNEL, vm_flags: `0`, node, caller);
3950	}
3951	/*
3952	* This is only for performance analysis of vmalloc and stress purpose.
3953	* It is required by vmalloc test module, therefore do not use it other
3954	* than that.
3955	*/
3956	#ifdef CONFIG_TEST_VMALLOC_MODULE
3957	EXPORT_SYMBOL_GPL(__vmalloc_node_noprof);
3958	#endif
3959
3960	void __vmalloc_noprof(unsigned* long size, gfp_t gfp_mask)
3961	{
3962	return __vmalloc_node_noprof(size, align: `1`, gfp_mask, NUMA_NO_NODE,
3963	caller: __builtin_return_address(`0`));
3964	}
3965	EXPORT_SYMBOL(__vmalloc_noprof);
3966
3967	/**
3968	* vmalloc - allocate virtually contiguous memory
3969	* @size: allocation size
3970	*
3971	* Allocate enough pages to cover @size from the page level
3972	* allocator and map them into contiguous kernel virtual space.
3973	*
3974	* For tight control over page level allocator and protection flags
3975	* use __vmalloc() instead.
3976	*
3977	* Return: pointer to the allocated memory or %NULL on error
3978	*/
3979	void vmalloc_noprof(unsigned* long size)
3980	{
3981	return __vmalloc_node_noprof(size, align: `1`, GFP_KERNEL, NUMA_NO_NODE,
3982	caller: __builtin_return_address(`0`));
3983	}
3984	EXPORT_SYMBOL(vmalloc_noprof);
3985
3986	/**
3987	* vmalloc_huge_node - allocate virtually contiguous memory, allow huge pages
3988	* @size: allocation size
3989	* @gfp_mask: flags for the page level allocator
3990	* @node: node to use for allocation or NUMA_NO_NODE
3991	*
3992	* Allocate enough pages to cover @size from the page level
3993	* allocator and map them into contiguous kernel virtual space.
3994	* If @size is greater than or equal to PMD_SIZE, allow using
3995	* huge pages for the memory
3996	*
3997	* Return: pointer to the allocated memory or %NULL on error
3998	*/
3999	void vmalloc_huge_node_noprof(unsigned* long size, gfp_t gfp_mask, int node)
4000	{
4001	return __vmalloc_node_range_noprof(size, align: `1`, VMALLOC_START, VMALLOC_END,
4002	gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
4003	node, caller: __builtin_return_address(`0`));
4004	}
4005	EXPORT_SYMBOL_GPL(vmalloc_huge_node_noprof);
4006
4007	/**
4008	* vzalloc - allocate virtually contiguous memory with zero fill
4009	* @size: allocation size
4010	*
4011	* Allocate enough pages to cover @size from the page level
4012	* allocator and map them into contiguous kernel virtual space.
4013	* The memory allocated is set to zero.
4014	*
4015	* For tight control over page level allocator and protection flags
4016	* use __vmalloc() instead.
4017	*
4018	* Return: pointer to the allocated memory or %NULL on error
4019	*/
4020	void vzalloc_noprof(unsigned* long size)
4021	{
4022	return __vmalloc_node_noprof(size, align: `1`, GFP_KERNEL \| __GFP_ZERO, NUMA_NO_NODE,
4023	caller: __builtin_return_address(`0`));
4024	}
4025	EXPORT_SYMBOL(vzalloc_noprof);
4026
4027	/**
4028	* vmalloc_user - allocate zeroed virtually contiguous memory for userspace
4029	* @size: allocation size
4030	*
4031	* The resulting memory area is zeroed so it can be mapped to userspace
4032	* without leaking data.
4033	*
4034	* Return: pointer to the allocated memory or %NULL on error
4035	*/
4036	void vmalloc_user_noprof(unsigned* long size)
4037	{
4038	return __vmalloc_node_range_noprof(size, SHMLBA, VMALLOC_START, VMALLOC_END,
4039	GFP_KERNEL \| __GFP_ZERO, PAGE_KERNEL,
4040	VM_USERMAP, NUMA_NO_NODE,
4041	caller: __builtin_return_address(`0`));
4042	}
4043	EXPORT_SYMBOL(vmalloc_user_noprof);
4044
4045	/**
4046	* vmalloc_node - allocate memory on a specific node
4047	* @size: allocation size
4048	* @node: numa node
4049	*
4050	* Allocate enough pages to cover @size from the page level
4051	* allocator and map them into contiguous kernel virtual space.
4052	*
4053	* For tight control over page level allocator and protection flags
4054	* use __vmalloc() instead.
4055	*
4056	* Return: pointer to the allocated memory or %NULL on error
4057	*/
4058	void vmalloc_node_noprof(unsigned* long size, int node)
4059	{
4060	return __vmalloc_node_noprof(size, align: `1`, GFP_KERNEL, node,
4061	caller: __builtin_return_address(`0`));
4062	}
4063	EXPORT_SYMBOL(vmalloc_node_noprof);
4064
4065	/**
4066	* vzalloc_node - allocate memory on a specific node with zero fill
4067	* @size: allocation size
4068	* @node: numa node
4069	*
4070	* Allocate enough pages to cover @size from the page level
4071	* allocator and map them into contiguous kernel virtual space.
4072	* The memory allocated is set to zero.
4073	*
4074	* Return: pointer to the allocated memory or %NULL on error
4075	*/
4076	void vzalloc_node_noprof(unsigned* long size, int node)
4077	{
4078	return __vmalloc_node_noprof(size, align: `1`, GFP_KERNEL \| __GFP_ZERO, node,
4079	caller: __builtin_return_address(`0`));
4080	}
4081	EXPORT_SYMBOL(vzalloc_node_noprof);
4082
4083	/**
4084	* vrealloc - reallocate virtually contiguous memory; contents remain unchanged
4085	* @p: object to reallocate memory for
4086	* @size: the size to reallocate
4087	* @flags: the flags for the page level allocator
4088	*
4089	* If @p is %NULL, vrealloc() behaves exactly like vmalloc(). If @size is 0 and
4090	* @p is not a %NULL pointer, the object pointed to is freed.
4091	*
4092	* If __GFP_ZERO logic is requested, callers must ensure that, starting with the
4093	* initial memory allocation, every subsequent call to this API for the same
4094	* memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
4095	* __GFP_ZERO is not fully honored by this API.
4096	*
4097	* In any case, the contents of the object pointed to are preserved up to the
4098	* lesser of the new and old sizes.
4099	*
4100	* This function must not be called concurrently with itself or vfree() for the
4101	* same memory allocation.
4102	*
4103	* Return: pointer to the allocated memory; %NULL if @size is zero or in case of
4104	* failure
4105	*/
4106	void vrealloc_noprof(const* void *p, size_t size, gfp_t flags)
4107	{
4108	struct vm_struct *vm = NULL;
4109	size_t alloced_size = `0`;
4110	size_t old_size = `0`;
4111	void *n;
4112
4113	if (!size) {
4114	vfree(p);
4115	return NULL;
4116	}
4117
4118	if (p) {
4119	vm = find_vm_area(addr: p);
4120	if (unlikely(!vm)) {
4121	WARN(`1`, "Trying to vrealloc() nonexistent vm area (%p)\n", p);
4122	return NULL;
4123	}
4124
4125	alloced_size = get_vm_area_size(area: vm);
4126	old_size = vm->requested_size;
4127	if (WARN(alloced_size < old_size,
4128	"vrealloc() has mismatched area vs requested sizes (%p)\n", p))
4129	return NULL;
4130	}
4131
4132	/*
4133	* TODO: Shrink the vm_area, i.e. unmap and free unused pages. What
4134	* would be a good heuristic for when to shrink the vm_area?
4135	*/
4136	if (size <= old_size) {
4137	/ Zero out "freed" memory, potentially for future realloc. /
4138	if (want_init_on_free() \|\| want_init_on_alloc(flags))
4139	memset((void *)p + size, `0`, old_size - size);
4140	vm->requested_size = size;
4141	kasan_poison_vmalloc(start: p + size, size: old_size - size);
4142	return (void *)p;
4143	}
4144
4145	/*
4146	* We already have the bytes available in the allocation; use them.
4147	*/
4148	if (size <= alloced_size) {
4149	kasan_unpoison_vmalloc(start: p + old_size, size: size - old_size,
4150	KASAN_VMALLOC_PROT_NORMAL);
4151	/*
4152	* No need to zero memory here, as unused memory will have
4153	* already been zeroed at initial allocation time or during
4154	* realloc shrink time.
4155	*/
4156	vm->requested_size = size;
4157	return (void *)p;
4158	}
4159
4160	/ TODO: Grow the vm_area, i.e. allocate and map additional pages. /
4161	n = __vmalloc_noprof(size, flags);
4162	if (!n)
4163	return NULL;
4164
4165	if (p) {
4166	memcpy(n, p, old_size);
4167	vfree(p);
4168	}
4169
4170	return n;
4171	}
4172
/*
 * GFP_VMALLOC32: GFP mask used by the vmalloc_32*() allocators below to
 * request 32bit-addressable pages.
 */
#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
#elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
#define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL)
#else
/*
 * 64b systems should always have either DMA or DMA32 zones. For others
 * GFP_DMA32 should do the right thing and use the normal zone.
 */
#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
#endif
4184
4185	/**
4186	* vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
4187	* @size: allocation size
4188	*
4189	* Allocate enough 32bit PA addressable pages to cover @size from the
4190	* page level allocator and map them into contiguous kernel virtual space.
4191	*
4192	* Return: pointer to the allocated memory or %NULL on error
4193	*/
4194	void vmalloc_32_noprof(unsigned* long size)
4195	{
4196	return __vmalloc_node_noprof(size, align: `1`, GFP_VMALLOC32, NUMA_NO_NODE,
4197	caller: __builtin_return_address(`0`));
4198	}
4199	EXPORT_SYMBOL(vmalloc_32_noprof);
4200
4201	/**
4202	* vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
4203	* @size: allocation size
4204	*
4205	* The resulting memory area is 32bit addressable and zeroed so it can be
4206	* mapped to userspace without leaking data.
4207	*
4208	* Return: pointer to the allocated memory or %NULL on error
4209	*/
4210	void vmalloc_32_user_noprof(unsigned* long size)
4211	{
4212	return __vmalloc_node_range_noprof(size, SHMLBA, VMALLOC_START, VMALLOC_END,
4213	GFP_VMALLOC32 \| __GFP_ZERO, PAGE_KERNEL,
4214	VM_USERMAP, NUMA_NO_NODE,
4215	caller: __builtin_return_address(`0`));
4216	}
4217	EXPORT_SYMBOL(vmalloc_32_user_noprof);
4218
4219	/*
4220	* Atomically zero bytes in the iterator.
4221	*
4222	* Returns the number of zeroed bytes.
4223	*/
4224	static size_t zero_iter(struct iov_iter *iter, size_t count)
4225	{
4226	size_t remains = count;
4227
4228	while (remains > `0`) {
4229	size_t num, copied;
4230
4231	num = min_t(size_t, remains, PAGE_SIZE);
4232	copied = copy_page_to_iter_nofault(ZERO_PAGE(`0`), offset: `0`, bytes: num, i: iter);
4233	remains -= copied;
4234
4235	if (copied < num)
4236	break;
4237	}
4238
4239	return count - remains;
4240	}
4241
4242	/*
4243	* small helper routine, copy contents to iter from addr.
4244	* If the page is not present, fill zero.
4245	*
4246	* Returns the number of copied bytes.
4247	*/
4248	static size_t aligned_vread_iter(struct iov_iter *iter,
4249	const char *addr, size_t count)
4250	{
4251	size_t remains = count;
4252	struct page *page;
4253
4254	while (remains > `0`) {
4255	unsigned long offset, length;
4256	size_t copied = `0`;
4257
4258	offset = offset_in_page(addr);
4259	length = PAGE_SIZE - offset;
4260	if (length > remains)
4261	length = remains;
4262	page = vmalloc_to_page(addr);
4263	/*
4264	* To do safe access to this _mapped_ area, we need lock. But
4265	* adding lock here means that we need to add overhead of
4266	* vmalloc()/vfree() calls for this _debug_ interface, rarely
4267	* used. Instead of that, we'll use an local mapping via
4268	* copy_page_to_iter_nofault() and accept a small overhead in
4269	* this access function.
4270	*/
4271	if (page)
4272	copied = copy_page_to_iter_nofault(page, offset,
4273	bytes: length, i: iter);
4274	else
4275	copied = zero_iter(iter, count: length);
4276
4277	addr += copied;
4278	remains -= copied;
4279
4280	if (copied != length)
4281	break;
4282	}
4283
4284	return count - remains;
4285	}
4286
4287	/*
4288	* Read from a vm_map_ram region of memory.
4289	*
4290	* Returns the number of copied bytes.
4291	*/
4292	static size_t vmap_ram_vread_iter(struct iov_iter iter, const* char *addr,
4293	size_t count, unsigned long flags)
4294	{
4295	char *start;
4296	struct vmap_block *vb;
4297	struct xarray *xa;
4298	unsigned long offset;
4299	unsigned int rs, re;
4300	size_t remains, n;
4301
4302	/*
4303	* If it's area created by vm_map_ram() interface directly, but
4304	* not further subdividing and delegating management to vmap_block,
4305	* handle it here.
4306	*/
4307	if (!(flags & VMAP_BLOCK))
4308	return aligned_vread_iter(iter, addr, count);
4309
4310	remains = count;
4311
4312	/*
4313	* Area is split into regions and tracked with vmap_block, read out
4314	* each region and zero fill the hole between regions.
4315	*/
4316	xa = addr_to_vb_xa(addr: (unsigned long) addr);
4317	vb = xa_load(xa, index: addr_to_vb_idx(addr: (unsigned long)addr));
4318	if (!vb)
4319	goto finished_zero;
4320
4321	spin_lock(lock: &vb->lock);
4322	if (bitmap_empty(src: vb->used_map, VMAP_BBMAP_BITS)) {
4323	spin_unlock(lock: &vb->lock);
4324	goto finished_zero;
4325	}
4326
4327	for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) {
4328	size_t copied;
4329
4330	if (remains == `0`)
4331	goto finished;
4332
4333	start = vmap_block_vaddr(va_start: vb->va->va_start, pages_off: rs);
4334
4335	if (addr < start) {
4336	size_t to_zero = min_t(size_t, start - addr, remains);
4337	size_t zeroed = zero_iter(iter, count: to_zero);
4338
4339	addr += zeroed;
4340	remains -= zeroed;
4341
4342	if (remains == `0` \|\| zeroed != to_zero)
4343	goto finished;
4344	}
4345
4346	/it could start reading from the middle of used region/
4347	offset = offset_in_page(addr);
4348	n = ((re - rs + `1`) << PAGE_SHIFT) - offset;
4349	if (n > remains)
4350	n = remains;
4351
4352	copied = aligned_vread_iter(iter, addr: start + offset, count: n);
4353
4354	addr += copied;
4355	remains -= copied;
4356
4357	if (copied != n)
4358	goto finished;
4359	}
4360
4361	spin_unlock(lock: &vb->lock);
4362
4363	finished_zero:
4364	/ zero-fill the left dirty or free regions /
4365	return count - remains + zero_iter(iter, count: remains);
4366	finished:
4367	/ We couldn't copy/zero everything /
4368	spin_unlock(lock: &vb->lock);
4369	return count - remains;
4370	}
4371
4372	/**
4373	* vread_iter() - read vmalloc area in a safe way to an iterator.
4374	* @iter: the iterator to which data should be written.
4375	* @addr: vm address.
4376	* @count: number of bytes to be read.
4377	*
4378	* This function checks that addr is a valid vmalloc'ed area, and
4379	* copy data from that area to a given buffer. If the given memory range
4380	* of [addr...addr+count) includes some valid address, data is copied to
4381	* proper area of @buf. If there are memory holes, they'll be zero-filled.
4382	* IOREMAP area is treated as memory hole and no copy is done.
4383	*
4384	* If [addr...addr+count) doesn't includes any intersects with alive
4385	* vm_struct area, returns 0. @buf should be kernel's buffer.
4386	*
4387	* Note: In usual ops, vread() is never necessary because the caller
4388	* should know vmalloc() area is valid and can use memcpy().
4389	* This is for routines which have to access vmalloc area without
4390	* any information, as /proc/kcore.
4391	*
4392	* Return: number of bytes for which addr and buf should be increased
4393	* (same number as @count) or %0 if [addr...addr+count) doesn't
4394	* include any intersection with valid vmalloc area
4395	*/
4396	long vread_iter(struct iov_iter iter, const* char *addr, size_t count)
4397	{
4398	struct vmap_node *vn;
4399	struct vmap_area *va;
4400	struct vm_struct *vm;
4401	char *vaddr;
4402	size_t n, size, flags, remains;
4403	unsigned long next;
4404
4405	addr = kasan_reset_tag(addr);
4406
4407	/ Don't allow overflow /
4408	if ((unsigned long) addr + count < count)
4409	count = -(unsigned long) addr;
4410
4411	remains = count;
4412
4413	vn = find_vmap_area_exceed_addr_lock(addr: (unsigned long) addr, va: &va);
4414	if (!vn)
4415	goto finished_zero;
4416
4417	/ no intersects with alive vmap_area /
4418	if ((unsigned long)addr + remains <= va->va_start)
4419	goto finished_zero;
4420
4421	do {
4422	size_t copied;
4423
4424	if (remains == `0`)
4425	goto finished;
4426
4427	vm = va->vm;
4428	flags = va->flags & VMAP_FLAGS_MASK;
4429	/*
4430	* VMAP_BLOCK indicates a sub-type of vm_map_ram area, need
4431	* be set together with VMAP_RAM.
4432	*/
4433	WARN_ON(flags == VMAP_BLOCK);
4434
4435	if (!vm && !flags)
4436	goto next_va;
4437
4438	if (vm && (vm->flags & VM_UNINITIALIZED))
4439	goto next_va;
4440
4441	/ Pair with smp_wmb() in clear_vm_uninitialized_flag() /
4442	smp_rmb();
4443
4444	vaddr = (char *) va->va_start;
4445	size = vm ? get_vm_area_size(area: vm) : va_size(va);
4446
4447	if (addr >= vaddr + size)
4448	goto next_va;
4449
4450	if (addr < vaddr) {
4451	size_t to_zero = min_t(size_t, vaddr - addr, remains);
4452	size_t zeroed = zero_iter(iter, count: to_zero);
4453
4454	addr += zeroed;
4455	remains -= zeroed;
4456
4457	if (remains == `0` \|\| zeroed != to_zero)
4458	goto finished;
4459	}
4460
4461	n = vaddr + size - addr;
4462	if (n > remains)
4463	n = remains;
4464
4465	if (flags & VMAP_RAM)
4466	copied = vmap_ram_vread_iter(iter, addr, count: n, flags);
4467	else if (!(vm && (vm->flags & (VM_IOREMAP \| VM_SPARSE))))
4468	copied = aligned_vread_iter(iter, addr, count: n);
4469	else / IOREMAP \| SPARSE area is treated as memory hole /
4470	copied = zero_iter(iter, count: n);
4471
4472	addr += copied;
4473	remains -= copied;
4474
4475	if (copied != n)
4476	goto finished;
4477
4478	next_va:
4479	next = va->va_end;
4480	spin_unlock(lock: &vn->busy.lock);
4481	} while ((vn = find_vmap_area_exceed_addr_lock(addr: next, va: &va)));
4482
4483	finished_zero:
4484	if (vn)
4485	spin_unlock(lock: &vn->busy.lock);
4486
4487	/ zero-fill memory holes /
4488	return count - remains + zero_iter(iter, count: remains);
4489	finished:
4490	/ Nothing remains, or We couldn't copy/zero everything. /
4491	if (vn)
4492	spin_unlock(lock: &vn->busy.lock);
4493
4494	return count - remains;
4495	}
4496
4497	/**
4498	* remap_vmalloc_range_partial - map vmalloc pages to userspace
4499	* @vma: vma to cover
4500	* @uaddr: target user address to start at
4501	* @kaddr: virtual address of vmalloc kernel memory
4502	* @pgoff: offset from @kaddr to start at
4503	* @size: size of map area
4504	*
4505	* Returns: 0 for success, -Exxx on failure
4506	*
4507	* This function checks that @kaddr is a valid vmalloc'ed area,
4508	* and that it is big enough to cover the range starting at
4509	* @uaddr in @vma. Will return failure if that criteria isn't
4510	* met.
4511	*
4512	* Similar to remap_pfn_range() (see mm/memory.c)
4513	*/
4514	int remap_vmalloc_range_partial(struct vm_area_struct vma, unsigned* long uaddr,
4515	void kaddr, unsigned* long pgoff,
4516	unsigned long size)
4517	{
4518	struct vm_struct *area;
4519	unsigned long off;
4520	unsigned long end_index;
4521
4522	if (check_shl_overflow(pgoff, PAGE_SHIFT, &off))
4523	return -EINVAL;
4524
4525	size = PAGE_ALIGN(size);
4526
4527	if (!PAGE_ALIGNED(uaddr) \|\| !PAGE_ALIGNED(kaddr))
4528	return -EINVAL;
4529
4530	area = find_vm_area(addr: kaddr);
4531	if (!area)
4532	return -EINVAL;
4533
4534	if (!(area->flags & (VM_USERMAP \| VM_DMA_COHERENT)))
4535	return -EINVAL;
4536
4537	if (check_add_overflow(size, off, &end_index) \|\|
4538	end_index > get_vm_area_size(area))
4539	return -EINVAL;
4540	kaddr += off;
4541
4542	do {
4543	struct page *page = vmalloc_to_page(kaddr);
4544	int ret;
4545
4546	ret = vm_insert_page(vma, addr: uaddr, page);
4547	if (ret)
4548	return ret;
4549
4550	uaddr += PAGE_SIZE;
4551	kaddr += PAGE_SIZE;
4552	size -= PAGE_SIZE;
4553	} while (size > `0`);
4554
4555	vm_flags_set(vma, VM_DONTEXPAND \| VM_DONTDUMP);
4556
4557	return `0`;
4558	}
4559
4560	/**
4561	* remap_vmalloc_range - map vmalloc pages to userspace
4562	* @vma: vma to cover (map full range of vma)
4563	* @addr: vmalloc memory
4564	* @pgoff: number of pages into addr before first page to map
4565	*
4566	* Returns: 0 for success, -Exxx on failure
4567	*
4568	* This function checks that addr is a valid vmalloc'ed area, and
4569	* that it is big enough to cover the vma. Will return failure if
4570	* that criteria isn't met.
4571	*
4572	* Similar to remap_pfn_range() (see mm/memory.c)
4573	*/
4574	int remap_vmalloc_range(struct vm_area_struct vma, void* *addr,
4575	unsigned long pgoff)
4576	{
4577	return remap_vmalloc_range_partial(vma, uaddr: vma->vm_start,
4578	kaddr: addr, pgoff,
4579	size: vma->vm_end - vma->vm_start);
4580	}
4581	EXPORT_SYMBOL(remap_vmalloc_range);
4582
4583	void free_vm_area(struct vm_struct *area)
4584	{
4585	struct vm_struct *ret;
4586	ret = remove_vm_area(addr: area->addr);
4587	BUG_ON(ret != area);
4588	kfree(objp: area);
4589	}
4590	EXPORT_SYMBOL_GPL(free_vm_area);
4591
4592	#ifdef CONFIG_SMP
4593	static struct vmap_area node_to_va(struct* rb_node *n)
4594	{
4595	return rb_entry_safe(n, struct vmap_area, rb_node);
4596	}
4597
4598	/**
4599	* pvm_find_va_enclose_addr - find the vmap_area @addr belongs to
4600	* @addr: target address
4601	*
4602	* Returns: vmap_area if it is found. If there is no such area
4603	* the first highest(reverse order) vmap_area is returned
4604	* i.e. va->va_start < addr && va->va_end < addr or NULL
4605	* if there are no any areas before @addr.
4606	*/
4607	static struct vmap_area *
4608	pvm_find_va_enclose_addr(unsigned long addr)
4609	{
4610	struct vmap_area va, tmp;
4611	struct rb_node *n;
4612
4613	n = free_vmap_area_root.rb_node;
4614	va = NULL;
4615
4616	while (n) {
4617	tmp = rb_entry(n, struct vmap_area, rb_node);
4618	if (tmp->va_start <= addr) {
4619	va = tmp;
4620	if (tmp->va_end >= addr)
4621	break;
4622
4623	n = n->rb_right;
4624	} else {
4625	n = n->rb_left;
4626	}
4627	}
4628
4629	return va;
4630	}
4631
4632	/**
4633	* pvm_determine_end_from_reverse - find the highest aligned address
4634	* of free block below VMALLOC_END
4635	* @va:
4636	* in - the VA we start the search(reverse order);
4637	* out - the VA with the highest aligned end address.
4638	* @align: alignment for required highest address
4639	*
4640	* Returns: determined end address within vmap_area
4641	*/
4642	static unsigned long
4643	pvm_determine_end_from_reverse(struct vmap_area *va, unsigned* long align)
4644	{
4645	unsigned long vmalloc_end = VMALLOC_END & ~(align - `1`);
4646	unsigned long addr;
4647
4648	if (likely(*va)) {
4649	list_for_each_entry_from_reverse((*va),
4650	&free_vmap_area_list, list) {
4651	addr = min((*va)->va_end & ~(align - `1`), vmalloc_end);
4652	if ((*va)->va_start < addr)
4653	return addr;
4654	}
4655	}
4656
4657	return `0`;
4658	}
4659
4660	/**
4661	* pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
4662	* @offsets: array containing offset of each area
4663	* @sizes: array containing size of each area
4664	* @nr_vms: the number of areas to allocate
4665	* @align: alignment, all entries in @offsets and @sizes must be aligned to this
4666	*
4667	* Returns: kmalloc'd vm_struct pointer array pointing to allocated
4668	* vm_structs on success, %NULL on failure
4669	*
4670	* Percpu allocator wants to use congruent vm areas so that it can
4671	* maintain the offsets among percpu areas. This function allocates
4672	* congruent vmalloc areas for it with GFP_KERNEL. These areas tend to
4673	* be scattered pretty far, distance between two areas easily going up
4674	* to gigabytes. To avoid interacting with regular vmallocs, these
4675	* areas are allocated from top.
4676	*
4677	* Despite its complicated look, this allocator is rather simple. It
4678	* does everything top-down and scans free blocks from the end looking
4679	* for matching base. While scanning, if any of the areas do not fit the
4680	* base address is pulled down to fit the area. Scanning is repeated till
4681	* all the areas fit and then all necessary data structures are inserted
4682	* and the result is returned.
4683	*/
4684	struct vm_struct *pcpu_get_vm_areas(const* unsigned long *offsets,
4685	const size_t sizes, int* nr_vms,
4686	size_t align)
4687	{
4688	const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
4689	const unsigned long vmalloc_end = VMALLOC_END & ~(align - `1`);
4690	struct vmap_area *vas, va;
4691	struct vm_struct **vms;
4692	int area, area2, last_area, term_area;
4693	unsigned long base, start, size, end, last_end, orig_start, orig_end;
4694	bool purged = false;
4695
4696	/ verify parameters and allocate data structures /
4697	BUG_ON(offset_in_page(align) \|\| !is_power_of_2(align));
4698	for (last_area = `0`, area = `0`; area < nr_vms; area++) {
4699	start = offsets[area];
4700	end = start + sizes[area];
4701
4702	/ is everything aligned properly? /
4703	BUG_ON(!IS_ALIGNED(offsets[area], align));
4704	BUG_ON(!IS_ALIGNED(sizes[area], align));
4705
4706	/ detect the area with the highest address /
4707	if (start > offsets[last_area])
4708	last_area = area;
4709
4710	for (area2 = area + `1`; area2 < nr_vms; area2++) {
4711	unsigned long start2 = offsets[area2];
4712	unsigned long end2 = start2 + sizes[area2];
4713
4714	BUG_ON(start2 < end && start < end2);
4715	}
4716	}
4717	last_end = offsets[last_area] + sizes[last_area];
4718
4719	if (vmalloc_end - vmalloc_start < last_end) {
4720	WARN_ON(true);
4721	return NULL;
4722	}
4723
4724	vms = kcalloc(nr_vms, sizeof(vms[`0`]), GFP_KERNEL);
4725	vas = kcalloc(nr_vms, sizeof(vas[`0`]), GFP_KERNEL);
4726	if (!vas \|\| !vms)
4727	goto err_free2;
4728
4729	for (area = `0`; area < nr_vms; area++) {
4730	vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL);
4731	vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
4732	if (!vas[area] \|\| !vms[area])
4733	goto err_free;
4734	}
4735	retry:
4736	spin_lock(lock: &free_vmap_area_lock);
4737
4738	/ start scanning - we scan from the top, begin with the last area /
4739	area = term_area = last_area;
4740	start = offsets[area];
4741	end = start + sizes[area];
4742
4743	va = pvm_find_va_enclose_addr(addr: vmalloc_end);
4744	base = pvm_determine_end_from_reverse(va: &va, align) - end;
4745
4746	while (true) {
4747	/*
4748	* base might have underflowed, add last_end before
4749	* comparing.
4750	*/
4751	if (base + last_end < vmalloc_start + last_end)
4752	goto overflow;
4753
4754	/*
4755	* Fitting base has not been found.
4756	*/
4757	if (va == NULL)
4758	goto overflow;
4759
4760	/*
4761	* If required width exceeds current VA block, move
4762	* base downwards and then recheck.
4763	*/
4764	if (base + end > va->va_end) {
4765	base = pvm_determine_end_from_reverse(va: &va, align) - end;
4766	term_area = area;
4767	continue;
4768	}
4769
4770	/*
4771	* If this VA does not fit, move base downwards and recheck.
4772	*/
4773	if (base + start < va->va_start) {
4774	va = node_to_va(n: rb_prev(&va->rb_node));
4775	base = pvm_determine_end_from_reverse(va: &va, align) - end;
4776	term_area = area;
4777	continue;
4778	}
4779
4780	/*
4781	* This area fits, move on to the previous one. If
4782	* the previous one is the terminal one, we're done.
4783	*/
4784	area = (area + nr_vms - `1`) % nr_vms;
4785	if (area == term_area)
4786	break;
4787
4788	start = offsets[area];
4789	end = start + sizes[area];
4790	va = pvm_find_va_enclose_addr(addr: base + end);
4791	}
4792
4793	/ we've found a fitting base, insert all va's /
4794	for (area = `0`; area < nr_vms; area++) {
4795	int ret;
4796
4797	start = base + offsets[area];
4798	size = sizes[area];
4799
4800	va = pvm_find_va_enclose_addr(addr: start);
4801	if (WARN_ON_ONCE(va == NULL))
4802	/ It is a BUG(), but trigger recovery instead. /
4803	goto recovery;
4804
4805	ret = va_clip(root: &free_vmap_area_root,
4806	head: &free_vmap_area_list, va, nva_start_addr: start, size);
4807	if (WARN_ON_ONCE(unlikely(ret)))
4808	/ It is a BUG(), but trigger recovery instead. /
4809	goto recovery;
4810
4811	/ Allocated area. /
4812	va = vas[area];
4813	va->va_start = start;
4814	va->va_end = start + size;
4815	}
4816
4817	spin_unlock(lock: &free_vmap_area_lock);
4818
4819	/ populate the kasan shadow space /
4820	for (area = `0`; area < nr_vms; area++) {
4821	if (kasan_populate_vmalloc(addr: vas[area]->va_start, size: sizes[area]))
4822	goto err_free_shadow;
4823	}
4824
4825	/ insert all vm's /
4826	for (area = `0`; area < nr_vms; area++) {
4827	struct vmap_node *vn = addr_to_node(addr: vas[area]->va_start);
4828
4829	spin_lock(lock: &vn->busy.lock);
4830	insert_vmap_area(va: vas[area], root: &vn->busy.root, head: &vn->busy.head);
4831	setup_vmalloc_vm(vm: vms[area], va: vas[area], VM_ALLOC,
4832	caller: pcpu_get_vm_areas);
4833	spin_unlock(lock: &vn->busy.lock);
4834	}
4835
4836	/*
4837	* Mark allocated areas as accessible. Do it now as a best-effort
4838	* approach, as they can be mapped outside of vmalloc code.
4839	* With hardware tag-based KASAN, marking is skipped for
4840	* non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
4841	*/
4842	for (area = `0`; area < nr_vms; area++)
4843	vms[area]->addr = kasan_unpoison_vmalloc(start: vms[area]->addr,
4844	size: vms[area]->size, KASAN_VMALLOC_PROT_NORMAL);
4845
4846	kfree(objp: vas);
4847	return vms;
4848
4849	recovery:
4850	/*
4851	* Remove previously allocated areas. There is no
4852	* need in removing these areas from the busy tree,
4853	* because they are inserted only on the final step
4854	* and when pcpu_get_vm_areas() is success.
4855	*/
4856	while (area--) {
4857	orig_start = vas[area]->va_start;
4858	orig_end = vas[area]->va_end;
4859	va = merge_or_add_vmap_area_augment(va: vas[area], root: &free_vmap_area_root,
4860	head: &free_vmap_area_list);
4861	if (va)
4862	kasan_release_vmalloc(start: orig_start, end: orig_end,
4863	free_region_start: va->va_start, free_region_end: va->va_end,
4864	KASAN_VMALLOC_PAGE_RANGE \| KASAN_VMALLOC_TLB_FLUSH);
4865	vas[area] = NULL;
4866	}
4867
4868	overflow:
4869	spin_unlock(lock: &free_vmap_area_lock);
4870	if (!purged) {
4871	reclaim_and_purge_vmap_areas();
4872	purged = true;
4873
4874	/ Before "retry", check if we recover. /
4875	for (area = `0`; area < nr_vms; area++) {
4876	if (vas[area])
4877	continue;
4878
4879	vas[area] = kmem_cache_zalloc(
4880	vmap_area_cachep, GFP_KERNEL);
4881	if (!vas[area])
4882	goto err_free;
4883	}
4884
4885	goto retry;
4886	}
4887
4888	err_free:
4889	for (area = `0`; area < nr_vms; area++) {
4890	if (vas[area])
4891	kmem_cache_free(s: vmap_area_cachep, objp: vas[area]);
4892
4893	kfree(objp: vms[area]);
4894	}
4895	err_free2:
4896	kfree(objp: vas);
4897	kfree(objp: vms);
4898	return NULL;
4899
4900	err_free_shadow:
4901	spin_lock(lock: &free_vmap_area_lock);
4902	/*
4903	* We release all the vmalloc shadows, even the ones for regions that
4904	* hadn't been successfully added. This relies on kasan_release_vmalloc
4905	* being able to tolerate this case.
4906	*/
4907	for (area = `0`; area < nr_vms; area++) {
4908	orig_start = vas[area]->va_start;
4909	orig_end = vas[area]->va_end;
4910	va = merge_or_add_vmap_area_augment(va: vas[area], root: &free_vmap_area_root,
4911	head: &free_vmap_area_list);
4912	if (va)
4913	kasan_release_vmalloc(start: orig_start, end: orig_end,
4914	free_region_start: va->va_start, free_region_end: va->va_end,
4915	KASAN_VMALLOC_PAGE_RANGE \| KASAN_VMALLOC_TLB_FLUSH);
4916	vas[area] = NULL;
4917	kfree(objp: vms[area]);
4918	}
4919	spin_unlock(lock: &free_vmap_area_lock);
4920	kfree(objp: vas);
4921	kfree(objp: vms);
4922	return NULL;
4923	}
4924
/**
 * pcpu_free_vm_areas - free vmalloc areas for percpu allocator
 * @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
 * @nr_vms: the number of allocated areas
 *
 * Free vm_structs and the array allocated by pcpu_get_vm_areas().
 */
void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
{
	int i;

	for (i = 0; i < nr_vms; i++)
		free_vm_area(vms[i]);
	kfree(vms);
}
4940	#endif /* CONFIG_SMP */
4941
4942	#ifdef CONFIG_PRINTK
4943	bool vmalloc_dump_obj(void *object)
4944	{
4945	const void *caller;
4946	struct vm_struct *vm;
4947	struct vmap_area *va;
4948	struct vmap_node *vn;
4949	unsigned long addr;
4950	unsigned int nr_pages;
4951
4952	addr = PAGE_ALIGN((unsigned long) object);
4953	vn = addr_to_node(addr);
4954
4955	if (!spin_trylock(lock: &vn->busy.lock))
4956	return false;
4957
4958	va = __find_vmap_area(addr, root: &vn->busy.root);
4959	if (!va \|\| !va->vm) {
4960	spin_unlock(lock: &vn->busy.lock);
4961	return false;
4962	}
4963
4964	vm = va->vm;
4965	addr = (unsigned long) vm->addr;
4966	caller = vm->caller;
4967	nr_pages = vm->nr_pages;
4968	spin_unlock(lock: &vn->busy.lock);
4969
4970	pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n",
4971	nr_pages, addr, caller);
4972
4973	return true;
4974	}
4975	#endif
4976
4977	#ifdef CONFIG_PROC_FS
4978
4979	/*
4980	* Print number of pages allocated on each memory node.
4981	*
4982	* This function can only be called if CONFIG_NUMA is enabled
4983	* and VM_UNINITIALIZED bit in v->flags is disabled.
4984	*/
4985	static void show_numa_info(struct seq_file m, struct* vm_struct *v,
4986	unsigned int *counters)
4987	{
4988	unsigned int nr;
4989	unsigned int step = `1U` << vm_area_page_order(vm: v);
4990
4991	if (!counters)
4992	return;
4993
4994	memset(counters, `0`, nr_node_ids * sizeof(unsigned int));
4995
4996	for (nr = `0`; nr < v->nr_pages; nr += step)
4997	counters[page_to_nid(page: v->pages[nr])] += step;
4998	for_each_node_state(nr, N_HIGH_MEMORY)
4999	if (counters[nr])
5000	seq_printf(m, fmt: " N%u=%u", nr, counters[nr]);
5001	}
5002
5003	static void show_purge_info(struct seq_file *m)
5004	{
5005	struct vmap_node *vn;
5006	struct vmap_area *va;
5007
5008	for_each_vmap_node(vn) {
5009	spin_lock(lock: &vn->lazy.lock);
5010	list_for_each_entry(va, &vn->lazy.head, list) {
5011	seq_printf(m, fmt: "0x%pK-0x%pK %7ld unpurged vm_area\n",
5012	(void )va->va_start, (void* *)va->va_end,
5013	va_size(va));
5014	}
5015	spin_unlock(lock: &vn->lazy.lock);
5016	}
5017	}
5018
5019	static int vmalloc_info_show(struct seq_file m, void* *p)
5020	{
5021	struct vmap_node *vn;
5022	struct vmap_area *va;
5023	struct vm_struct *v;
5024	unsigned int *counters;
5025
5026	if (IS_ENABLED(CONFIG_NUMA))
5027	counters = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
5028
5029	for_each_vmap_node(vn) {
5030	spin_lock(lock: &vn->busy.lock);
5031	list_for_each_entry(va, &vn->busy.head, list) {
5032	if (!va->vm) {
5033	if (va->flags & VMAP_RAM)
5034	seq_printf(m, fmt: "0x%pK-0x%pK %7ld vm_map_ram\n",
5035	(void )va->va_start, (void* *)va->va_end,
5036	va_size(va));
5037
5038	continue;
5039	}
5040
5041	v = va->vm;
5042	if (v->flags & VM_UNINITIALIZED)
5043	continue;
5044
5045	/ Pair with smp_wmb() in clear_vm_uninitialized_flag() /
5046	smp_rmb();
5047
5048	seq_printf(m, fmt: "0x%pK-0x%pK %7ld",
5049	v->addr, v->addr + v->size, v->size);
5050
5051	if (v->caller)
5052	seq_printf(m, fmt: " %pS", v->caller);
5053
5054	if (v->nr_pages)
5055	seq_printf(m, fmt: " pages=%d", v->nr_pages);
5056
5057	if (v->phys_addr)
5058	seq_printf(m, fmt: " phys=%pa", &v->phys_addr);
5059
5060	if (v->flags & VM_IOREMAP)
5061	seq_puts(m, s: " ioremap");
5062
5063	if (v->flags & VM_SPARSE)
5064	seq_puts(m, s: " sparse");
5065
5066	if (v->flags & VM_ALLOC)
5067	seq_puts(m, s: " vmalloc");
5068
5069	if (v->flags & VM_MAP)
5070	seq_puts(m, s: " vmap");
5071
5072	if (v->flags & VM_USERMAP)
5073	seq_puts(m, s: " user");
5074
5075	if (v->flags & VM_DMA_COHERENT)
5076	seq_puts(m, s: " dma-coherent");
5077
5078	if (is_vmalloc_addr(v->pages))
5079	seq_puts(m, s: " vpages");
5080
5081	if (IS_ENABLED(CONFIG_NUMA))
5082	show_numa_info(m, v, counters);
5083
5084	seq_putc(m, c: `'\n'`);
5085	}
5086	spin_unlock(lock: &vn->busy.lock);
5087	}
5088
5089	/*
5090	* As a final step, dump "unpurged" areas.
5091	*/
5092	show_purge_info(m);
5093	if (IS_ENABLED(CONFIG_NUMA))
5094	kfree(objp: counters);
5095	return `0`;
5096	}
5097
5098	static int __init proc_vmalloc_init(void)
5099	{
5100	proc_create_single("vmallocinfo", `0400`, NULL, vmalloc_info_show);
5101	return `0`;
5102	}
5103	module_init(proc_vmalloc_init);
5104
5105	#endif
5106
5107	static void __init vmap_init_free_space(void)
5108	{
5109	unsigned long vmap_start = `1`;
5110	const unsigned long vmap_end = ULONG_MAX;
5111	struct vmap_area *free;
5112	struct vm_struct *busy;
5113
5114	/*
5115	* B F B B B F
5116	* -\|-----\|.....\|-----\|-----\|-----\|.....\|-
5117	* \| The KVA space \|
5118	* \|<--------------------------------->\|
5119	*/
5120	for (busy = vmlist; busy; busy = busy->next) {
5121	if ((unsigned long) busy->addr - vmap_start > `0`) {
5122	free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
5123	if (!WARN_ON_ONCE(!free)) {
5124	free->va_start = vmap_start;
5125	free->va_end = (unsigned long) busy->addr;
5126
5127	insert_vmap_area_augment(va: free, NULL,
5128	root: &free_vmap_area_root,
5129	head: &free_vmap_area_list);
5130	}
5131	}
5132
5133	vmap_start = (unsigned long) busy->addr + busy->size;
5134	}
5135
5136	if (vmap_end - vmap_start > `0`) {
5137	free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
5138	if (!WARN_ON_ONCE(!free)) {
5139	free->va_start = vmap_start;
5140	free->va_end = vmap_end;
5141
5142	insert_vmap_area_augment(va: free, NULL,
5143	root: &free_vmap_area_root,
5144	head: &free_vmap_area_list);
5145	}
5146	}
5147	}
5148
5149	static void vmap_init_nodes(void)
5150	{
5151	struct vmap_node *vn;
5152	int i;
5153
5154	#if BITS_PER_LONG == 64
5155	/*
5156	* A high threshold of max nodes is fixed and bound to 128,
5157	* thus a scale factor is 1 for systems where number of cores
5158	* are less or equal to specified threshold.
5159	*
5160	* As for NUMA-aware notes. For bigger systems, for example
5161	* NUMA with multi-sockets, where we can end-up with thousands
5162	* of cores in total, a "sub-numa-clustering" should be added.
5163	*
5164	* In this case a NUMA domain is considered as a single entity
5165	* with dedicated sub-nodes in it which describe one group or
5166	* set of cores. Therefore a per-domain purging is supposed to
5167	* be added as well as a per-domain balancing.
5168	*/
5169	int n = clamp_t(unsigned int, num_possible_cpus(), `1`, `128`);
5170
5171	if (n > `1`) {
5172	vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT \| __GFP_NOWARN);
5173	if (vn) {
5174	/ Node partition is 16 pages. /
5175	vmap_zone_size = (`1` << `4`) * PAGE_SIZE;
5176	nr_vmap_nodes = n;
5177	vmap_nodes = vn;
5178	} else {
5179	pr_err("Failed to allocate an array. Disable a node layer\n");
5180	}
5181	}
5182	#endif
5183
5184	for_each_vmap_node(vn) {
5185	vn->busy.root = RB_ROOT;
5186	INIT_LIST_HEAD(list: &vn->busy.head);
5187	spin_lock_init(&vn->busy.lock);
5188
5189	vn->lazy.root = RB_ROOT;
5190	INIT_LIST_HEAD(list: &vn->lazy.head);
5191	spin_lock_init(&vn->lazy.lock);
5192
5193	for (i = `0`; i < MAX_VA_SIZE_PAGES; i++) {
5194	INIT_LIST_HEAD(list: &vn->pool[i].head);
5195	WRITE_ONCE(vn->pool[i].len, `0`);
5196	}
5197
5198	spin_lock_init(&vn->pool_lock);
5199	}
5200	}
5201
5202	static unsigned long
5203	vmap_node_shrink_count(struct shrinker shrink, struct* shrink_control *sc)
5204	{
5205	unsigned long count = `0`;
5206	struct vmap_node *vn;
5207	int i;
5208
5209	for_each_vmap_node(vn) {
5210	for (i = `0`; i < MAX_VA_SIZE_PAGES; i++)
5211	count += READ_ONCE(vn->pool[i].len);
5212	}
5213
5214	return count ? count : SHRINK_EMPTY;
5215	}
5216
5217	static unsigned long
5218	vmap_node_shrink_scan(struct shrinker shrink, struct* shrink_control *sc)
5219	{
5220	struct vmap_node *vn;
5221
5222	for_each_vmap_node(vn)
5223	decay_va_pool_node(vn, full_decay: true);
5224
5225	return SHRINK_STOP;
5226	}
5227
5228	void __init vmalloc_init(void)
5229	{
5230	struct shrinker *vmap_node_shrinker;
5231	struct vmap_area *va;
5232	struct vmap_node *vn;
5233	struct vm_struct *tmp;
5234	int i;
5235
5236	/*
5237	* Create the cache for vmap_area objects.
5238	*/
5239	vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
5240
5241	for_each_possible_cpu(i) {
5242	struct vmap_block_queue *vbq;
5243	struct vfree_deferred *p;
5244
5245	vbq = &per_cpu(vmap_block_queue, i);
5246	spin_lock_init(&vbq->lock);
5247	INIT_LIST_HEAD(list: &vbq->free);
5248	p = &per_cpu(vfree_deferred, i);
5249	init_llist_head(list: &p->list);
5250	INIT_WORK(&p->wq, delayed_vfree_work);
5251	xa_init(xa: &vbq->vmap_blocks);
5252	}
5253
5254	/*
5255	* Setup nodes before importing vmlist.
5256	*/
5257	vmap_init_nodes();
5258
5259	/ Import existing vmlist entries. /
5260	for (tmp = vmlist; tmp; tmp = tmp->next) {
5261	va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
5262	if (WARN_ON_ONCE(!va))
5263	continue;
5264
5265	va->va_start = (unsigned long)tmp->addr;
5266	va->va_end = va->va_start + tmp->size;
5267	va->vm = tmp;
5268
5269	vn = addr_to_node(addr: va->va_start);
5270	insert_vmap_area(va, root: &vn->busy.root, head: &vn->busy.head);
5271	}
5272
5273	/*
5274	* Now we can initialize a free vmap space.
5275	*/
5276	vmap_init_free_space();
5277	vmap_initialized = true;
5278
5279	vmap_node_shrinker = shrinker_alloc(flags: `0`, fmt: "vmap-node");
5280	if (!vmap_node_shrinker) {
5281	pr_err("Failed to allocate vmap-node shrinker!\n");
5282	return;
5283	}
5284
5285	vmap_node_shrinker->count_objects = vmap_node_shrink_count;
5286	vmap_node_shrinker->scan_objects = vmap_node_shrink_scan;
5287	shrinker_register(shrinker: vmap_node_shrinker);
5288	}
5289

source code of linux/mm/vmalloc.c