rmap.h source code [linux/include/linux/rmap.h]

/* SPDX-License-Identifier: GPL-2.0 */
2	#ifndef _LINUX_RMAP_H
3	#define _LINUX_RMAP_H
4	/*
5	* Declarations for Reverse Mapping functions in mm/rmap.c
6	*/
7
8	#include <linux/list.h>
9	#include <linux/slab.h>
10	#include <linux/mm.h>
11	#include <linux/rwsem.h>
12	#include <linux/memcontrol.h>
13	#include <linux/highmem.h>
14	#include <linux/pagemap.h>
15	#include <linux/memremap.h>
16	#include <linux/bit_spinlock.h>
17
18	/*
19	* The anon_vma heads a list of private "related" vmas, to scan if
20	* an anonymous page pointing to this anon_vma needs to be unmapped:
21	* the vmas on the list will be related by forking, or by splitting.
22	*
23	* Since vmas come and go as they are split and merged (particularly
24	* in mprotect), the mapping field of an anonymous page cannot point
25	* directly to a vma: instead it points to an anon_vma, on whose list
26	* the related vmas can be easily linked or unlinked.
27	*
28	* After unlinking the last vma on the list, we must garbage collect
29	* the anon_vma object itself: we're guaranteed no page can be
30	* pointing to this anon_vma once its vma list is empty.
31	*/
32	struct anon_vma {
33	struct anon_vma root; /* Root of this anon_vma tree /
34	struct rw_semaphore rwsem; / W: modification, R: walking the list /
35	/*
36	* The refcount is taken on an anon_vma when there is no
37	* guarantee that the vma of page tables will exist for
38	* the duration of the operation. A caller that takes
39	* the reference is responsible for clearing up the
40	* anon_vma if they are the last user on release
41	*/
42	atomic_t refcount;
43
44	/*
45	* Count of child anon_vmas. Equals to the count of all anon_vmas that
46	* have ->parent pointing to this one, including itself.
47	*
48	* This counter is used for making decision about reusing anon_vma
49	* instead of forking new one. See comments in function anon_vma_clone.
50	*/
51	unsigned long num_children;
52	/ Count of VMAs whose ->anon_vma pointer points to this object. /
53	unsigned long num_active_vmas;
54
55	struct anon_vma parent; /* Parent of this anon_vma /
56
57	/*
58	* NOTE: the LSB of the rb_root.rb_node is set by
59	* mm_take_all_locks() _after_ taking the above lock. So the
60	* rb_root must only be read/written after taking the above lock
61	* to be sure to see a valid next pointer. The LSB bit itself
62	* is serialized by a system wide lock only visible to
63	* mm_take_all_locks() (mm_all_locks_mutex).
64	*/
65
66	/ Interval tree of private "related" vmas /
67	struct rb_root_cached rb_root;
68	};
69
70	/*
71	* The copy-on-write semantics of fork mean that an anon_vma
72	* can become associated with multiple processes. Furthermore,
73	* each child process will have its own anon_vma, where new
74	* pages for that process are instantiated.
75	*
76	* This structure allows us to find the anon_vmas associated
77	* with a VMA, or the VMAs associated with an anon_vma.
78	* The "same_vma" list contains the anon_vma_chains linking
79	* all the anon_vmas associated with this VMA.
80	* The "rb" field indexes on an interval tree the anon_vma_chains
81	* which link all the VMAs associated with this anon_vma.
82	*/
83	struct anon_vma_chain {
84	struct vm_area_struct *vma;
85	struct anon_vma *anon_vma;
86	struct list_head same_vma; / locked by mmap_lock & page_table_lock /
87	struct rb_node rb; / locked by anon_vma->rwsem /
88	unsigned long rb_subtree_last;
89	#ifdef CONFIG_DEBUG_VM_RB
90	unsigned long cached_vma_start, cached_vma_last;
91	#endif
92	};
93
/* Flag bits for the try_to_unmap()/try_to_migrate() family. */
enum ttu_flags {
	TTU_SPLIT_HUGE_PMD	= 0x4,	/* split huge PMD if any */
	TTU_IGNORE_MLOCK	= 0x8,	/* ignore mlock */
	TTU_SYNC		= 0x10,	/* avoid racy checks with PVMW_SYNC */
	TTU_HWPOISON		= 0x20,	/* do convert pte to hwpoison entry */
	TTU_BATCH_FLUSH		= 0x40,	/* Batch TLB flushes where possible
					 * and caller guarantees they will
					 * do a final flush if necessary */
	TTU_RMAP_LOCKED		= 0x80,	/* do not grab rmap lock:
					 * caller holds it */
};
105
106	#ifdef CONFIG_MMU
107	static inline void get_anon_vma(struct anon_vma *anon_vma)
108	{
109	atomic_inc(v: &anon_vma->refcount);
110	}
111
112	void __put_anon_vma(struct anon_vma *anon_vma);
113
114	static inline void put_anon_vma(struct anon_vma *anon_vma)
115	{
116	if (atomic_dec_and_test(v: &anon_vma->refcount))
117	__put_anon_vma(anon_vma);
118	}
119
120	static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
121	{
122	down_write(sem: &anon_vma->root->rwsem);
123	}
124
125	static inline int anon_vma_trylock_write(struct anon_vma *anon_vma)
126	{
127	return down_write_trylock(sem: &anon_vma->root->rwsem);
128	}
129
130	static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
131	{
132	up_write(sem: &anon_vma->root->rwsem);
133	}
134
135	static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
136	{
137	down_read(sem: &anon_vma->root->rwsem);
138	}
139
140	static inline int anon_vma_trylock_read(struct anon_vma *anon_vma)
141	{
142	return down_read_trylock(sem: &anon_vma->root->rwsem);
143	}
144
145	static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
146	{
147	up_read(sem: &anon_vma->root->rwsem);
148	}
149
150
151	/*
152	* anon_vma helper functions.
153	*/
void anon_vma_init(void);	/* create anon_vma_cachep */
int  __anon_vma_prepare(struct vm_area_struct *);
void unlink_anon_vmas(struct vm_area_struct *);
int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);
159
160	static inline int anon_vma_prepare(struct vm_area_struct *vma)
161	{
162	if (likely(vma->anon_vma))
163	return `0`;
164
165	return __anon_vma_prepare(vma);
166	}
167
168	static inline void anon_vma_merge(struct vm_area_struct *vma,
169	struct vm_area_struct *next)
170	{
171	VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma);
172	unlink_anon_vmas(next);
173	}
174
175	struct anon_vma folio_get_anon_vma(const* struct folio *folio);
176
177	#ifdef CONFIG_MM_ID
178	static __always_inline void folio_lock_large_mapcount(struct folio *folio)
179	{
180	bit_spin_lock(FOLIO_MM_IDS_LOCK_BITNUM, addr: &folio->_mm_ids);
181	}
182
183	static __always_inline void folio_unlock_large_mapcount(struct folio *folio)
184	{
185	__bit_spin_unlock(FOLIO_MM_IDS_LOCK_BITNUM, addr: &folio->_mm_ids);
186	}
187
188	static inline unsigned int folio_mm_id(const struct folio folio, int* idx)
189	{
190	VM_WARN_ON_ONCE(idx != `0` && idx != `1`);
191	return folio->_mm_id[idx] & MM_ID_MASK;
192	}
193
194	static inline void folio_set_mm_id(struct folio folio, int* idx, mm_id_t id)
195	{
196	VM_WARN_ON_ONCE(idx != `0` && idx != `1`);
197	folio->_mm_id[idx] &= ~MM_ID_MASK;
198	folio->_mm_id[idx] \|= id;
199	}
200
201	static inline void __folio_large_mapcount_sanity_checks(const struct folio *folio,
202	int diff, mm_id_t mm_id)
203	{
204	VM_WARN_ON_ONCE(!folio_test_large(folio) \|\| folio_test_hugetlb(folio));
205	VM_WARN_ON_ONCE(diff <= `0`);
206	VM_WARN_ON_ONCE(mm_id < MM_ID_MIN \|\| mm_id > MM_ID_MAX);
207
208	/*
209	* Make sure we can detect at least one complete PTE mapping of the
210	* folio in a single MM as "exclusively mapped". This is primarily
211	* a check on 32bit, where we currently reduce the size of the per-MM
212	* mapcount to a short.
213	*/
214	VM_WARN_ON_ONCE(diff > folio_large_nr_pages(folio));
215	VM_WARN_ON_ONCE(folio_large_nr_pages(folio) - `1` > MM_ID_MAPCOUNT_MAX);
216
217	VM_WARN_ON_ONCE(folio_mm_id(folio, `0`) == MM_ID_DUMMY &&
218	folio->_mm_id_mapcount[`0`] != -`1`);
219	VM_WARN_ON_ONCE(folio_mm_id(folio, `0`) != MM_ID_DUMMY &&
220	folio->_mm_id_mapcount[`0`] < `0`);
221	VM_WARN_ON_ONCE(folio_mm_id(folio, `1`) == MM_ID_DUMMY &&
222	folio->_mm_id_mapcount[`1`] != -`1`);
223	VM_WARN_ON_ONCE(folio_mm_id(folio, `1`) != MM_ID_DUMMY &&
224	folio->_mm_id_mapcount[`1`] < `0`);
225	VM_WARN_ON_ONCE(!folio_mapped(folio) &&
226	test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids));
227	}
228
229	static __always_inline void folio_set_large_mapcount(struct folio *folio,
230	int mapcount, struct vm_area_struct *vma)
231	{
232	__folio_large_mapcount_sanity_checks(folio, diff: mapcount, mm_id: vma->vm_mm->mm_id);
233
234	VM_WARN_ON_ONCE(folio_mm_id(folio, `0`) != MM_ID_DUMMY);
235	VM_WARN_ON_ONCE(folio_mm_id(folio, `1`) != MM_ID_DUMMY);
236
237	/ Note: mapcounts start at -1. /
238	atomic_set(v: &folio->_large_mapcount, i: mapcount - `1`);
239	folio->_mm_id_mapcount[`0`] = mapcount - `1`;
240	folio_set_mm_id(folio, idx: `0`, id: vma->vm_mm->mm_id);
241	}
242
243	static __always_inline int folio_add_return_large_mapcount(struct folio *folio,
244	int diff, struct vm_area_struct *vma)
245	{
246	const mm_id_t mm_id = vma->vm_mm->mm_id;
247	int new_mapcount_val;
248
249	folio_lock_large_mapcount(folio);
250	__folio_large_mapcount_sanity_checks(folio, diff, mm_id);
251
252	new_mapcount_val = atomic_read(v: &folio->_large_mapcount) + diff;
253	atomic_set(v: &folio->_large_mapcount, i: new_mapcount_val);
254
255	/*
256	* If a folio is mapped more than once into an MM on 32bit, we
257	* can in theory overflow the per-MM mapcount (although only for
258	* fairly large folios), turning it negative. In that case, just
259	* free up the slot and mark the folio "mapped shared", otherwise
260	* we might be in trouble when unmapping pages later.
261	*/
262	if (folio_mm_id(folio, idx: `0`) == mm_id) {
263	folio->_mm_id_mapcount[`0`] += diff;
264	if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio->_mm_id_mapcount[`0`] < `0`)) {
265	folio->_mm_id_mapcount[`0`] = -`1`;
266	folio_set_mm_id(folio, idx: `0`, MM_ID_DUMMY);
267	folio->_mm_ids \|= FOLIO_MM_IDS_SHARED_BIT;
268	}
269	} else if (folio_mm_id(folio, idx: `1`) == mm_id) {
270	folio->_mm_id_mapcount[`1`] += diff;
271	if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio->_mm_id_mapcount[`1`] < `0`)) {
272	folio->_mm_id_mapcount[`1`] = -`1`;
273	folio_set_mm_id(folio, idx: `1`, MM_ID_DUMMY);
274	folio->_mm_ids \|= FOLIO_MM_IDS_SHARED_BIT;
275	}
276	} else if (folio_mm_id(folio, idx: `0`) == MM_ID_DUMMY) {
277	folio_set_mm_id(folio, idx: `0`, id: mm_id);
278	folio->_mm_id_mapcount[`0`] = diff - `1`;
279	/ We might have other mappings already. /
280	if (new_mapcount_val != diff - `1`)
281	folio->_mm_ids \|= FOLIO_MM_IDS_SHARED_BIT;
282	} else if (folio_mm_id(folio, idx: `1`) == MM_ID_DUMMY) {
283	folio_set_mm_id(folio, idx: `1`, id: mm_id);
284	folio->_mm_id_mapcount[`1`] = diff - `1`;
285	/ Slot 0 certainly has mappings as well. /
286	folio->_mm_ids \|= FOLIO_MM_IDS_SHARED_BIT;
287	}
288	folio_unlock_large_mapcount(folio);
289	return new_mapcount_val + `1`;
290	}
291	#define folio_add_large_mapcount folio_add_return_large_mapcount
292
293	static __always_inline int folio_sub_return_large_mapcount(struct folio *folio,
294	int diff, struct vm_area_struct *vma)
295	{
296	const mm_id_t mm_id = vma->vm_mm->mm_id;
297	int new_mapcount_val;
298
299	folio_lock_large_mapcount(folio);
300	__folio_large_mapcount_sanity_checks(folio, diff, mm_id);
301
302	new_mapcount_val = atomic_read(v: &folio->_large_mapcount) - diff;
303	atomic_set(v: &folio->_large_mapcount, i: new_mapcount_val);
304
305	/*
306	* There are valid corner cases where we might underflow a per-MM
307	* mapcount (some mappings added when no slot was free, some mappings
308	* added once a slot was free), so we always set it to -1 once we go
309	* negative.
310	*/
311	if (folio_mm_id(folio, idx: `0`) == mm_id) {
312	folio->_mm_id_mapcount[`0`] -= diff;
313	if (folio->_mm_id_mapcount[`0`] >= `0`)
314	goto out;
315	folio->_mm_id_mapcount[`0`] = -`1`;
316	folio_set_mm_id(folio, idx: `0`, MM_ID_DUMMY);
317	} else if (folio_mm_id(folio, idx: `1`) == mm_id) {
318	folio->_mm_id_mapcount[`1`] -= diff;
319	if (folio->_mm_id_mapcount[`1`] >= `0`)
320	goto out;
321	folio->_mm_id_mapcount[`1`] = -`1`;
322	folio_set_mm_id(folio, idx: `1`, MM_ID_DUMMY);
323	}
324
325	/*
326	* If one MM slot owns all mappings, the folio is mapped exclusively.
327	* Note that if the folio is now unmapped (new_mapcount_val == -1), both
328	* slots must be free (mapcount == -1), and we'll also mark it as
329	* exclusive.
330	*/
331	if (folio->_mm_id_mapcount[`0`] == new_mapcount_val \|\|
332	folio->_mm_id_mapcount[`1`] == new_mapcount_val)
333	folio->_mm_ids &= ~FOLIO_MM_IDS_SHARED_BIT;
334	out:
335	folio_unlock_large_mapcount(folio);
336	return new_mapcount_val + `1`;
337	}
338	#define folio_sub_large_mapcount folio_sub_return_large_mapcount
339	#else /* !CONFIG_MM_ID */
340	/*
341	* See __folio_rmap_sanity_checks(), we might map large folios even without
342	* CONFIG_TRANSPARENT_HUGEPAGE. We'll keep that working for now.
343	*/
344	static inline void folio_set_large_mapcount(struct folio folio, int* mapcount,
345	struct vm_area_struct *vma)
346	{
347	/ Note: mapcounts start at -1. /
348	atomic_set(&folio->_large_mapcount, mapcount - `1`);
349	}
350
351	static inline void folio_add_large_mapcount(struct folio *folio,
352	int diff, struct vm_area_struct *vma)
353	{
354	atomic_add(diff, &folio->_large_mapcount);
355	}
356
357	static inline int folio_add_return_large_mapcount(struct folio *folio,
358	int diff, struct vm_area_struct *vma)
359	{
360	BUILD_BUG();
361	}
362
363	static inline void folio_sub_large_mapcount(struct folio *folio,
364	int diff, struct vm_area_struct *vma)
365	{
366	atomic_sub(diff, &folio->_large_mapcount);
367	}
368
369	static inline int folio_sub_return_large_mapcount(struct folio *folio,
370	int diff, struct vm_area_struct *vma)
371	{
372	BUILD_BUG();
373	}
374	#endif /* CONFIG_MM_ID */
375
/*
 * Single-mapping convenience wrappers: each one forwards to the matching
 * folio_{add,sub}[_return]_large_mapcount() helper with a diff of 1.
 */
#define folio_inc_large_mapcount(folio, vma) \
	folio_add_large_mapcount(folio, 1, vma)
#define folio_inc_return_large_mapcount(folio, vma) \
	folio_add_return_large_mapcount(folio, 1, vma)
#define folio_dec_large_mapcount(folio, vma) \
	folio_sub_large_mapcount(folio, 1, vma)
#define folio_dec_return_large_mapcount(folio, vma) \
	folio_sub_return_large_mapcount(folio, 1, vma)
384
385	/ RMAP flags, currently only relevant for some anon rmap operations. /
386	typedef int __bitwise rmap_t;
387
388	/*
389	* No special request: A mapped anonymous (sub)page is possibly shared between
390	* processes.
391	*/
392	#define RMAP_NONE ((__force rmap_t)0)
393
394	/ The anonymous (sub)page is exclusive to a single process. /
395	#define RMAP_EXCLUSIVE ((__force rmap_t)BIT(0))
396
397	/*
398	* Internally, we're using an enum to specify the granularity. We make the
399	* compiler emit specialized code for each granularity.
400	*/
enum rmap_level {
	RMAP_LEVEL_PTE = 0,
	RMAP_LEVEL_PMD,
	RMAP_LEVEL_PUD,
};
406
407	static inline void __folio_rmap_sanity_checks(const struct folio *folio,
408	const struct page page, int* nr_pages, enum rmap_level level)
409	{
410	/ hugetlb folios are handled separately. /
411	VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
412
413	/ When (un)mapping zeropages, we should never touch ref+mapcount. /
414	VM_WARN_ON_FOLIO(is_zero_folio(folio), folio);
415
416	/*
417	* TODO: we get driver-allocated folios that have nothing to do with
418	* the rmap using vm_insert_page(); therefore, we cannot assume that
419	* folio_test_large_rmappable() holds for large folios. We should
420	* handle any desired mapcount+stats accounting for these folios in
421	* VM_MIXEDMAP VMAs separately, and then sanity-check here that
422	* we really only get rmappable folios.
423	*/
424
425	VM_WARN_ON_ONCE(nr_pages <= `0`);
426	VM_WARN_ON_FOLIO(page_folio(page) != folio, folio);
427	VM_WARN_ON_FOLIO(page_folio(page + nr_pages - `1`) != folio, folio);
428
429	switch (level) {
430	case RMAP_LEVEL_PTE:
431	break;
432	case RMAP_LEVEL_PMD:
433	/*
434	* We don't support folios larger than a single PMD yet. So
435	* when RMAP_LEVEL_PMD is set, we assume that we are creating
436	* a single "entire" mapping of the folio.
437	*/
438	VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio);
439	VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio);
440	break;
441	case RMAP_LEVEL_PUD:
442	/*
443	* Assume that we are creating a single "entire" mapping of the
444	* folio.
445	*/
446	VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PUD_NR, folio);
447	VM_WARN_ON_FOLIO(nr_pages != HPAGE_PUD_NR, folio);
448	break;
449	default:
450	VM_WARN_ON_ONCE(true);
451	}
452	}
453
454	/*
455	* rmap interfaces called when adding or removing pte of page
456	*/
457	void folio_move_anon_rmap(struct folio , struct* vm_area_struct *);
458	void folio_add_anon_rmap_ptes(struct folio , struct* page , int* nr_pages,
459	struct vm_area_struct , unsigned* long address, rmap_t flags);
460	#define folio_add_anon_rmap_pte(folio, page, vma, address, flags) \
461	folio_add_anon_rmap_ptes(folio, page, 1, vma, address, flags)
462	void folio_add_anon_rmap_pmd(struct folio , struct* page *,
463	struct vm_area_struct , unsigned* long address, rmap_t flags);
464	void folio_add_new_anon_rmap(struct folio , struct* vm_area_struct *,
465	unsigned long address, rmap_t flags);
466	void folio_add_file_rmap_ptes(struct folio , struct* page , int* nr_pages,
467	struct vm_area_struct *);
468	#define folio_add_file_rmap_pte(folio, page, vma) \
469	folio_add_file_rmap_ptes(folio, page, 1, vma)
470	void folio_add_file_rmap_pmd(struct folio , struct* page *,
471	struct vm_area_struct *);
472	void folio_add_file_rmap_pud(struct folio , struct* page *,
473	struct vm_area_struct *);
474	void folio_remove_rmap_ptes(struct folio , struct* page , int* nr_pages,
475	struct vm_area_struct *);
476	#define folio_remove_rmap_pte(folio, page, vma) \
477	folio_remove_rmap_ptes(folio, page, 1, vma)
478	void folio_remove_rmap_pmd(struct folio , struct* page *,
479	struct vm_area_struct *);
480	void folio_remove_rmap_pud(struct folio , struct* page *,
481	struct vm_area_struct *);
482
483	void hugetlb_add_anon_rmap(struct folio , struct* vm_area_struct *,
484	unsigned long address, rmap_t flags);
485	void hugetlb_add_new_anon_rmap(struct folio , struct* vm_area_struct *,
486	unsigned long address);
487
488	/ See folio_try_dup_anon_rmap_() /*
489	static inline int hugetlb_try_dup_anon_rmap(struct folio *folio,
490	struct vm_area_struct *vma)
491	{
492	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
493	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
494
495	if (PageAnonExclusive(page: &folio->page)) {
496	if (unlikely(folio_needs_cow_for_dma(vma, folio)))
497	return -EBUSY;
498	ClearPageAnonExclusive(page: &folio->page);
499	}
500	atomic_inc(v: &folio->_entire_mapcount);
501	atomic_inc(v: &folio->_large_mapcount);
502	return `0`;
503	}
504
505	/ See folio_try_share_anon_rmap_() /*
506	static inline int hugetlb_try_share_anon_rmap(struct folio *folio)
507	{
508	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
509	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
510	VM_WARN_ON_FOLIO(!PageAnonExclusive(&folio->page), folio);
511
512	/ Paired with the memory barrier in try_grab_folio(). /
513	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
514	smp_mb();
515
516	if (unlikely(folio_maybe_dma_pinned(folio)))
517	return -EBUSY;
518	ClearPageAnonExclusive(page: &folio->page);
519
520	/*
521	* This is conceptually a smp_wmb() paired with the smp_rmb() in
522	* gup_must_unshare().
523	*/
524	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
525	smp_mb__after_atomic();
526	return `0`;
527	}
528
529	static inline void hugetlb_add_file_rmap(struct folio *folio)
530	{
531	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
532	VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);
533
534	atomic_inc(v: &folio->_entire_mapcount);
535	atomic_inc(v: &folio->_large_mapcount);
536	}
537
538	static inline void hugetlb_remove_rmap(struct folio *folio)
539	{
540	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
541
542	atomic_dec(v: &folio->_entire_mapcount);
543	atomic_dec(v: &folio->_large_mapcount);
544	}
545
546	static __always_inline void __folio_dup_file_rmap(struct folio *folio,
547	struct page page, int* nr_pages, struct vm_area_struct *dst_vma,
548	enum rmap_level level)
549	{
550	const int orig_nr_pages = nr_pages;
551
552	__folio_rmap_sanity_checks(folio, page, nr_pages, level);
553
554	switch (level) {
555	case RMAP_LEVEL_PTE:
556	if (!folio_test_large(folio)) {
557	atomic_inc(v: &folio->_mapcount);
558	break;
559	}
560
561	if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) {
562	do {
563	atomic_inc(v: &page->_mapcount);
564	} while (page++, --nr_pages > `0`);
565	}
566	folio_add_large_mapcount(folio, diff: orig_nr_pages, vma: dst_vma);
567	break;
568	case RMAP_LEVEL_PMD:
569	case RMAP_LEVEL_PUD:
570	atomic_inc(v: &folio->_entire_mapcount);
571	folio_inc_large_mapcount(folio, dst_vma);
572	break;
573	}
574	}
575
576	/**
577	* folio_dup_file_rmap_ptes - duplicate PTE mappings of a page range of a folio
578	* @folio: The folio to duplicate the mappings of
579	* @page: The first page to duplicate the mappings of
580	* @nr_pages: The number of pages of which the mapping will be duplicated
581	* @dst_vma: The destination vm area
582	*
583	* The page range of the folio is defined by [page, page + nr_pages)
584	*
585	* The caller needs to hold the page table lock.
586	*/
587	static inline void folio_dup_file_rmap_ptes(struct folio *folio,
588	struct page page, int* nr_pages, struct vm_area_struct *dst_vma)
589	{
590	__folio_dup_file_rmap(folio, page, nr_pages, dst_vma, level: RMAP_LEVEL_PTE);
591	}
592
593	static __always_inline void folio_dup_file_rmap_pte(struct folio *folio,
594	struct page page, struct* vm_area_struct *dst_vma)
595	{
596	__folio_dup_file_rmap(folio, page, nr_pages: `1`, dst_vma, level: RMAP_LEVEL_PTE);
597	}
598
599	/**
600	* folio_dup_file_rmap_pmd - duplicate a PMD mapping of a page range of a folio
601	* @folio: The folio to duplicate the mapping of
602	* @page: The first page to duplicate the mapping of
603	* @dst_vma: The destination vm area
604	*
605	* The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
606	*
607	* The caller needs to hold the page table lock.
608	*/
609	static inline void folio_dup_file_rmap_pmd(struct folio *folio,
610	struct page page, struct* vm_area_struct *dst_vma)
611	{
612	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
613	__folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, dst_vma, level: RMAP_LEVEL_PTE);
614	#else
615	WARN_ON_ONCE(true);
616	#endif
617	}
618
619	static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
620	struct page page, int* nr_pages, struct vm_area_struct *dst_vma,
621	struct vm_area_struct src_vma, enum* rmap_level level)
622	{
623	const int orig_nr_pages = nr_pages;
624	bool maybe_pinned;
625	int i;
626
627	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
628	__folio_rmap_sanity_checks(folio, page, nr_pages, level);
629
630	/*
631	* If this folio may have been pinned by the parent process,
632	* don't allow to duplicate the mappings but instead require to e.g.,
633	* copy the subpage immediately for the child so that we'll always
634	* guarantee the pinned folio won't be randomly replaced in the
635	* future on write faults.
636	*/
637	maybe_pinned = likely(!folio_is_device_private(folio)) &&
638	unlikely(folio_needs_cow_for_dma(src_vma, folio));
639
640	/*
641	* No need to check+clear for already shared PTEs/PMDs of the
642	* folio. But if any page is PageAnonExclusive, we must fallback to
643	* copying if the folio maybe pinned.
644	*/
645	switch (level) {
646	case RMAP_LEVEL_PTE:
647	if (unlikely(maybe_pinned)) {
648	for (i = `0`; i < nr_pages; i++)
649	if (PageAnonExclusive(page: page + i))
650	return -EBUSY;
651	}
652
653	if (!folio_test_large(folio)) {
654	if (PageAnonExclusive(page))
655	ClearPageAnonExclusive(page);
656	atomic_inc(v: &folio->_mapcount);
657	break;
658	}
659
660	do {
661	if (PageAnonExclusive(page))
662	ClearPageAnonExclusive(page);
663	if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
664	atomic_inc(v: &page->_mapcount);
665	} while (page++, --nr_pages > `0`);
666	folio_add_large_mapcount(folio, diff: orig_nr_pages, vma: dst_vma);
667	break;
668	case RMAP_LEVEL_PMD:
669	case RMAP_LEVEL_PUD:
670	if (PageAnonExclusive(page)) {
671	if (unlikely(maybe_pinned))
672	return -EBUSY;
673	ClearPageAnonExclusive(page);
674	}
675	atomic_inc(v: &folio->_entire_mapcount);
676	folio_inc_large_mapcount(folio, dst_vma);
677	break;
678	}
679	return `0`;
680	}
681
682	/**
683	* folio_try_dup_anon_rmap_ptes - try duplicating PTE mappings of a page range
684	* of a folio
685	* @folio: The folio to duplicate the mappings of
686	* @page: The first page to duplicate the mappings of
687	* @nr_pages: The number of pages of which the mapping will be duplicated
688	* @dst_vma: The destination vm area
689	* @src_vma: The vm area from which the mappings are duplicated
690	*
691	* The page range of the folio is defined by [page, page + nr_pages)
692	*
693	* The caller needs to hold the page table lock and the
694	* vma->vma_mm->write_protect_seq.
695	*
696	* Duplicating the mappings can only fail if the folio may be pinned; device
697	* private folios cannot get pinned and consequently this function cannot fail
698	* for them.
699	*
700	* If duplicating the mappings succeeded, the duplicated PTEs have to be R/O in
701	* the parent and the child. They must not be writable after this call
702	* succeeded.
703	*
704	* Returns 0 if duplicating the mappings succeeded. Returns -EBUSY otherwise.
705	*/
706	static inline int folio_try_dup_anon_rmap_ptes(struct folio *folio,
707	struct page page, int* nr_pages, struct vm_area_struct *dst_vma,
708	struct vm_area_struct *src_vma)
709	{
710	return __folio_try_dup_anon_rmap(folio, page, nr_pages, dst_vma,
711	src_vma, level: RMAP_LEVEL_PTE);
712	}
713
714	static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio,
715	struct page page, struct* vm_area_struct *dst_vma,
716	struct vm_area_struct *src_vma)
717	{
718	return __folio_try_dup_anon_rmap(folio, page, nr_pages: `1`, dst_vma, src_vma,
719	level: RMAP_LEVEL_PTE);
720	}
721
722	/**
723	* folio_try_dup_anon_rmap_pmd - try duplicating a PMD mapping of a page range
724	* of a folio
725	* @folio: The folio to duplicate the mapping of
726	* @page: The first page to duplicate the mapping of
727	* @dst_vma: The destination vm area
728	* @src_vma: The vm area from which the mapping is duplicated
729	*
730	* The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
731	*
732	* The caller needs to hold the page table lock and the
733	* vma->vma_mm->write_protect_seq.
734	*
735	* Duplicating the mapping can only fail if the folio may be pinned; device
736	* private folios cannot get pinned and consequently this function cannot fail
737	* for them.
738	*
739	* If duplicating the mapping succeeds, the duplicated PMD has to be R/O in
740	* the parent and the child. They must not be writable after this call
741	* succeeded.
742	*
743	* Returns 0 if duplicating the mapping succeeded. Returns -EBUSY otherwise.
744	*/
745	static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio,
746	struct page page, struct* vm_area_struct *dst_vma,
747	struct vm_area_struct *src_vma)
748	{
749	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
750	return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, dst_vma,
751	src_vma, level: RMAP_LEVEL_PMD);
752	#else
753	WARN_ON_ONCE(true);
754	return -EBUSY;
755	#endif
756	}
757
758	static __always_inline int __folio_try_share_anon_rmap(struct folio *folio,
759	struct page page, int* nr_pages, enum rmap_level level)
760	{
761	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
762	VM_WARN_ON_FOLIO(!PageAnonExclusive(page), folio);
763	__folio_rmap_sanity_checks(folio, page, nr_pages, level);
764
765	/ device private folios cannot get pinned via GUP. /
766	if (unlikely(folio_is_device_private(folio))) {
767	ClearPageAnonExclusive(page);
768	return `0`;
769	}
770
771	/*
772	* We have to make sure that when we clear PageAnonExclusive, that
773	* the page is not pinned and that concurrent GUP-fast won't succeed in
774	* concurrently pinning the page.
775	*
776	* Conceptually, PageAnonExclusive clearing consists of:
777	* (A1) Clear PTE
778	* (A2) Check if the page is pinned; back off if so.
779	* (A3) Clear PageAnonExclusive
780	* (A4) Restore PTE (optional, but certainly not writable)
781	*
782	* When clearing PageAnonExclusive, we cannot possibly map the page
783	* writable again, because anon pages that may be shared must never
784	* be writable. So in any case, if the PTE was writable it cannot
785	* be writable anymore afterwards and there would be a PTE change. Only
786	* if the PTE wasn't writable, there might not be a PTE change.
787	*
788	* Conceptually, GUP-fast pinning of an anon page consists of:
789	* (B1) Read the PTE
790	* (B2) FOLL_WRITE: check if the PTE is not writable; back off if so.
791	* (B3) Pin the mapped page
792	* (B4) Check if the PTE changed by re-reading it; back off if so.
793	* (B5) If the original PTE is not writable, check if
794	* PageAnonExclusive is not set; back off if so.
795	*
796	* If the PTE was writable, we only have to make sure that GUP-fast
797	* observes a PTE change and properly backs off.
798	*
799	* If the PTE was not writable, we have to make sure that GUP-fast either
800	* detects a (temporary) PTE change or that PageAnonExclusive is cleared
801	* and properly backs off.
802	*
803	* Consequently, when clearing PageAnonExclusive(), we have to make
804	* sure that (A1), (A2)/(A3) and (A4) happen in the right memory
805	* order. In GUP-fast pinning code, we have to make sure that (B3),(B4)
806	* and (B5) happen in the right memory order.
807	*
808	* We assume that there might not be a memory barrier after
809	* clearing/invalidating the PTE (A1) and before restoring the PTE (A4),
810	* so we use explicit ones here.
811	*/
812
813	/ Paired with the memory barrier in try_grab_folio(). /
814	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
815	smp_mb();
816
817	if (unlikely(folio_maybe_dma_pinned(folio)))
818	return -EBUSY;
819	ClearPageAnonExclusive(page);
820
821	/*
822	* This is conceptually a smp_wmb() paired with the smp_rmb() in
823	* gup_must_unshare().
824	*/
825	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
826	smp_mb__after_atomic();
827	return `0`;
828	}
829
830	/**
831	* folio_try_share_anon_rmap_pte - try marking an exclusive anonymous page
832	* mapped by a PTE possibly shared to prepare
833	* for KSM or temporary unmapping
834	* @folio: The folio to share a mapping of
835	* @page: The mapped exclusive page
836	*
837	* The caller needs to hold the page table lock and has to have the page table
838	* entries cleared/invalidated.
839	*
840	* This is similar to folio_try_dup_anon_rmap_pte(), however, not used during
841	* fork() to duplicate mappings, but instead to prepare for KSM or temporarily
842	* unmapping parts of a folio (swap, migration) via folio_remove_rmap_pte().
843	*
844	* Marking the mapped page shared can only fail if the folio maybe pinned;
845	* device private folios cannot get pinned and consequently this function cannot
846	* fail.
847	*
848	* Returns 0 if marking the mapped page possibly shared succeeded. Returns
849	* -EBUSY otherwise.
850	*/
851	static inline int folio_try_share_anon_rmap_pte(struct folio *folio,
852	struct page *page)
853	{
854	return __folio_try_share_anon_rmap(folio, page, nr_pages: `1`, level: RMAP_LEVEL_PTE);
855	}
856
857	/**
858	* folio_try_share_anon_rmap_pmd - try marking an exclusive anonymous page
859	* range mapped by a PMD possibly shared to
860	* prepare for temporary unmapping
861	* @folio: The folio to share the mapping of
862	* @page: The first page to share the mapping of
863	*
864	* The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
865	*
866	* The caller needs to hold the page table lock and has to have the page table
867	* entries cleared/invalidated.
868	*
869	* This is similar to folio_try_dup_anon_rmap_pmd(), however, not used during
870	* fork() to duplicate a mapping, but instead to prepare for temporarily
871	* unmapping parts of a folio (swap, migration) via folio_remove_rmap_pmd().
872	*
873	* Marking the mapped pages shared can only fail if the folio maybe pinned;
874	* device private folios cannot get pinned and consequently this function cannot
875	* fail.
876	*
877	* Returns 0 if marking the mapped pages possibly shared succeeded. Returns
878	* -EBUSY otherwise.
879	*/
880	static inline int folio_try_share_anon_rmap_pmd(struct folio *folio,
881	struct page *page)
882	{
883	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
884	return __folio_try_share_anon_rmap(folio, page, HPAGE_PMD_NR,
885	level: RMAP_LEVEL_PMD);
886	#else
887	WARN_ON_ONCE(true);
888	return -EBUSY;
889	#endif
890	}
891
/*
 * Called from mm/vmscan.c to handle paging out
 */
int folio_referenced(struct folio *folio, int is_locked,
			struct mem_cgroup *memcg, unsigned long *vm_flags);

void try_to_migrate(struct folio *folio, enum ttu_flags flags);
void try_to_unmap(struct folio *folio, enum ttu_flags flags);

struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
		void *owner, struct folio **foliop);
903
/* Avoid racy checks */
#define PVMW_SYNC		(1 << 0)
/* Look for migration entries rather than present PTEs */
#define PVMW_MIGRATION		(1 << 1)
908
909	struct page_vma_mapped_walk {
910	unsigned long pfn;
911	unsigned long nr_pages;
912	pgoff_t pgoff;
913	struct vm_area_struct *vma;
914	unsigned long address;
915	pmd_t *pmd;
916	pte_t *pte;
917	spinlock_t *ptl;
918	unsigned int flags;
919	};
920
/*
 * Declare a struct page_vma_mapped_walk called @name that targets @_folio
 * within @_vma. Members that are not listed (pmd, pte, ptl) are
 * zero-initialised. Arguments are parenthesised so that arbitrary
 * expressions can be passed safely.
 */
#define DEFINE_FOLIO_VMA_WALK(name, _folio, _vma, _address, _flags)	\
	struct page_vma_mapped_walk name = {				\
		.pfn = folio_pfn(_folio),				\
		.nr_pages = folio_nr_pages(_folio),			\
		.pgoff = folio_pgoff(_folio),				\
		.vma = (_vma),						\
		.address = (_address),					\
		.flags = (_flags),					\
	}
930
931	static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
932	{
933	/ HugeTLB pte is set to the relevant page table entry without pte_mapped. /
934	if (pvmw->pte && !is_vm_hugetlb_page(vma: pvmw->vma))
935	pte_unmap(pte: pvmw->pte);
936	if (pvmw->ptl)
937	spin_unlock(lock: pvmw->ptl);
938	}
939
940	/**
941	* page_vma_mapped_walk_restart - Restart the page table walk.
942	* @pvmw: Pointer to struct page_vma_mapped_walk.
943	*
944	* It restarts the page table walk when changes occur in the page
945	* table, such as splitting a PMD. Ensures that the PTL held during
946	* the previous walk is released and resets the state to allow for
947	* a new walk starting at the current address stored in pvmw->address.
948	*/
949	static inline void
950	page_vma_mapped_walk_restart(struct page_vma_mapped_walk *pvmw)
951	{
952	WARN_ON_ONCE(!pvmw->pmd && !pvmw->pte);
953
954	if (likely(pvmw->ptl))
955	spin_unlock(lock: pvmw->ptl);
956	else
957	WARN_ON_ONCE(`1`);
958
959	pvmw->ptl = NULL;
960	pvmw->pmd = NULL;
961	pvmw->pte = NULL;
962	}
963
964	bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);
965	unsigned long page_address_in_vma(const struct folio *folio,
966	const struct page , const* struct vm_area_struct *);
967
968	/*
969	* Cleans the PTEs of shared mappings.
970	* (and since clean PTEs should also be readonly, write protects them too)
971	*
972	* returns the number of cleaned PTEs.
973	*/
974	int folio_mkclean(struct folio *);
975
976	int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff,
977	unsigned long pfn, unsigned long nr_pages);
978
979	int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
980	struct vm_area_struct *vma);
981
/* Bit flags; the values are distinct powers of two so they can be OR-ed. */
enum rmp_flags {
	RMP_LOCKED		= 1 << 0,
	RMP_USE_SHARED_ZEROPAGE	= 1 << 1,
};
986
/* NOTE(review): @flags presumably takes enum rmp_flags bits - confirm in mm/migrate.c */
void remove_migration_ptes(struct folio *src, struct folio *dst, int flags);
988
/*
 * rmap_walk_control: To control rmap traversing for specific needs
 *
 * arg: passed to rmap_one() and invalid_vma()
 * try_lock: bail out if the rmap lock is contended
 * contended: indicate the rmap traversal bailed out due to lock contention
 * rmap_one: executed on each vma where page is mapped
 * done: for checking traversing termination condition
 * anon_lock: for getting anon_lock by optimized way rather than default
 * invalid_vma: for skipping uninterested vma
 */
struct rmap_walk_control {
	void *arg;
	bool try_lock;
	bool contended;
	/*
	 * Return false if page table scanning in rmap_walk should be stopped.
	 * Otherwise, return true.
	 */
	bool (*rmap_one)(struct folio *folio, struct vm_area_struct *vma,
					unsigned long addr, void *arg);
	int (*done)(struct folio *folio);
	struct anon_vma *(*anon_lock)(const struct folio *folio,
				      struct rmap_walk_control *rwc);
	bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
};
1015
void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc);
void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc);
struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio,
					  struct rmap_walk_control *rwc);
1020
1021	#else /* !CONFIG_MMU */
1022
/* !CONFIG_MMU: anon_vma setup is a no-op and preparing one always yields 0. */
#define anon_vma_init()		do {} while (0)
#define anon_vma_prepare(vma)	(0)
1025
/*
 * !CONFIG_MMU stub: always returns 0 and zeroes *@vm_flags. The other
 * arguments are ignored.
 */
static inline int folio_referenced(struct folio *folio, int is_locked,
				   struct mem_cgroup *memcg,
				   unsigned long *vm_flags)
{
	*vm_flags = 0;
	return 0;
}
1033
1034	static inline void try_to_unmap(struct folio folio, enum* ttu_flags flags)
1035	{
1036	}
1037
/* !CONFIG_MMU stub: always returns 0 (no PTEs cleaned). */
static inline int folio_mkclean(struct folio *folio)
{
	return 0;
}
1042	#endif /* CONFIG_MMU */
1043
1044	#endif /* _LINUX_RMAP_H */
1045

/* source code of linux/include/linux/rmap.h */