rmap.c source code [linux/mm/rmap.c]

1	/*
2	* mm/rmap.c - physical to virtual reverse mappings
3	*
4	* Copyright 2001, Rik van Riel <riel@conectiva.com.br>
5	* Released under the General Public License (GPL).
6	*
7	* Simple, low overhead reverse mapping scheme.
8	* Please try to keep this thing as modular as possible.
9	*
10	* Provides methods for unmapping each kind of mapped page:
11	* the anon methods track anonymous pages, and
12	* the file methods track pages belonging to an inode.
13	*
14	* Original design by Rik van Riel <riel@conectiva.com.br> 2001
15	* File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
16	* Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
17	* Contributions by Hugh Dickins 2003, 2004
18	*/
19
20	/*
21	* Lock ordering in mm:
22	*
23	* inode->i_rwsem (while writing or truncating, not reading or faulting)
24	* mm->mmap_lock
25	* mapping->invalidate_lock (in filemap_fault)
26	* folio_lock
27	* hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below)
28	* vma_start_write
29	* mapping->i_mmap_rwsem
30	* anon_vma->rwsem
31	* mm->page_table_lock or pte_lock
32	* swap_lock (in swap_duplicate, swap_info_get)
33	* mmlist_lock (in mmput, drain_mmlist and others)
34	* mapping->private_lock (in block_dirty_folio)
35	* i_pages lock (widely used)
36	* lruvec->lru_lock (in folio_lruvec_lock_irq)
37	* inode->i_lock (in set_page_dirty's __mark_inode_dirty)
38	* bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
39	* sb_lock (within inode_lock in fs/fs-writeback.c)
40	* i_pages lock (widely used, in set_page_dirty,
41	* in arch-dependent flush_dcache_mmap_lock,
42	* within bdi.wb->list_lock in __sync_single_inode)
43	*
44	* anon_vma->rwsem,mapping->i_mmap_rwsem (memory_failure, collect_procs_anon)
45	* ->tasklist_lock
46	* pte map lock
47	*
48	* hugetlbfs PageHuge() take locks in this order:
49	* hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
50	* vma_lock (hugetlb specific lock for pmd_sharing)
51	* mapping->i_mmap_rwsem (also used for hugetlb pmd sharing)
52	* folio_lock
53	*/
54
55	#include <linux/mm.h>
56	#include <linux/sched/mm.h>
57	#include <linux/sched/task.h>
58	#include <linux/pagemap.h>
59	#include <linux/swap.h>
60	#include <linux/swapops.h>
61	#include <linux/slab.h>
62	#include <linux/init.h>
63	#include <linux/ksm.h>
64	#include <linux/rmap.h>
65	#include <linux/rcupdate.h>
66	#include <linux/export.h>
67	#include <linux/memcontrol.h>
68	#include <linux/mmu_notifier.h>
69	#include <linux/migrate.h>
70	#include <linux/hugetlb.h>
71	#include <linux/huge_mm.h>
72	#include <linux/backing-dev.h>
73	#include <linux/page_idle.h>
74	#include <linux/memremap.h>
75	#include <linux/userfaultfd_k.h>
76	#include <linux/mm_inline.h>
77	#include <linux/oom.h>
78
79	#include <asm/tlbflush.h>
80
81	#define CREATE_TRACE_POINTS
82	#include <trace/events/tlb.h>
83	#include <trace/events/migrate.h>
84
85	#include "internal.h"
86
/* Slab cache for struct anon_vma objects; created in anon_vma_init(). */
static struct kmem_cache *anon_vma_cachep;
/* Slab cache for struct anon_vma_chain objects; created in anon_vma_init(). */
static struct kmem_cache *anon_vma_chain_cachep;
89
90	static inline struct anon_vma anon_vma_alloc(void*)
91	{
92	struct anon_vma *anon_vma;
93
94	anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
95	if (anon_vma) {
96	atomic_set(v: &anon_vma->refcount, i: `1`);
97	anon_vma->num_children = `0`;
98	anon_vma->num_active_vmas = `0`;
99	anon_vma->parent = anon_vma;
100	/*
101	* Initialise the anon_vma root to point to itself. If called
102	* from fork, the root will be reset to the parents anon_vma.
103	*/
104	anon_vma->root = anon_vma;
105	}
106
107	return anon_vma;
108	}
109
110	static inline void anon_vma_free(struct anon_vma *anon_vma)
111	{
112	VM_BUG_ON(atomic_read(&anon_vma->refcount));
113
114	/*
115	* Synchronize against folio_lock_anon_vma_read() such that
116	* we can safely hold the lock without the anon_vma getting
117	* freed.
118	*
119	* Relies on the full mb implied by the atomic_dec_and_test() from
120	* put_anon_vma() against the acquire barrier implied by
121	* down_read_trylock() from folio_lock_anon_vma_read(). This orders:
122	*
123	* folio_lock_anon_vma_read() VS put_anon_vma()
124	* down_read_trylock() atomic_dec_and_test()
125	* LOCK MB
126	* atomic_read() rwsem_is_locked()
127	*
128	* LOCK should suffice since the actual taking of the lock must
129	* happen _before_ what follows.
130	*/
131	might_sleep();
132	if (rwsem_is_locked(sem: &anon_vma->root->rwsem)) {
133	anon_vma_lock_write(anon_vma);
134	anon_vma_unlock_write(anon_vma);
135	}
136
137	kmem_cache_free(s: anon_vma_cachep, objp: anon_vma);
138	}
139
140	static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
141	{
142	return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
143	}
144
145	static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
146	{
147	kmem_cache_free(s: anon_vma_chain_cachep, objp: anon_vma_chain);
148	}
149
150	static void anon_vma_chain_link(struct vm_area_struct *vma,
151	struct anon_vma_chain *avc,
152	struct anon_vma *anon_vma)
153	{
154	avc->vma = vma;
155	avc->anon_vma = anon_vma;
156	list_add(new: &avc->same_vma, head: &vma->anon_vma_chain);
157	anon_vma_interval_tree_insert(node: avc, root: &anon_vma->rb_root);
158	}
159
160	/**
161	* __anon_vma_prepare - attach an anon_vma to a memory region
162	* @vma: the memory region in question
163	*
164	* This makes sure the memory mapping described by 'vma' has
165	* an 'anon_vma' attached to it, so that we can associate the
166	* anonymous pages mapped into it with that anon_vma.
167	*
168	* The common case will be that we already have one, which
169	* is handled inline by anon_vma_prepare(). But if
170	* not we either need to find an adjacent mapping that we
171	* can re-use the anon_vma from (very common when the only
172	* reason for splitting a vma has been mprotect()), or we
173	* allocate a new one.
174	*
175	* Anon-vma allocations are very subtle, because we may have
176	* optimistically looked up an anon_vma in folio_lock_anon_vma_read()
177	* and that may actually touch the rwsem even in the newly
178	* allocated vma (it depends on RCU to make sure that the
179	* anon_vma isn't actually destroyed).
180	*
181	* As a result, we need to do proper anon_vma locking even
182	* for the new allocation. At the same time, we do not want
183	* to do any locking for the common case of already having
184	* an anon_vma.
185	*/
186	int __anon_vma_prepare(struct vm_area_struct *vma)
187	{
188	struct mm_struct *mm = vma->vm_mm;
189	struct anon_vma anon_vma, allocated;
190	struct anon_vma_chain *avc;
191
192	mmap_assert_locked(mm);
193	might_sleep();
194
195	avc = anon_vma_chain_alloc(GFP_KERNEL);
196	if (!avc)
197	goto out_enomem;
198
199	anon_vma = find_mergeable_anon_vma(vma);
200	allocated = NULL;
201	if (!anon_vma) {
202	anon_vma = anon_vma_alloc();
203	if (unlikely(!anon_vma))
204	goto out_enomem_free_avc;
205	anon_vma->num_children++; / self-parent link for new root /
206	allocated = anon_vma;
207	}
208
209	anon_vma_lock_write(anon_vma);
210	/ page_table_lock to protect against threads /
211	spin_lock(lock: &mm->page_table_lock);
212	if (likely(!vma->anon_vma)) {
213	vma->anon_vma = anon_vma;
214	anon_vma_chain_link(vma, avc, anon_vma);
215	anon_vma->num_active_vmas++;
216	allocated = NULL;
217	avc = NULL;
218	}
219	spin_unlock(lock: &mm->page_table_lock);
220	anon_vma_unlock_write(anon_vma);
221
222	if (unlikely(allocated))
223	put_anon_vma(anon_vma: allocated);
224	if (unlikely(avc))
225	anon_vma_chain_free(anon_vma_chain: avc);
226
227	return `0`;
228
229	out_enomem_free_avc:
230	anon_vma_chain_free(anon_vma_chain: avc);
231	out_enomem:
232	return -ENOMEM;
233	}
234
235	/*
236	* This is a useful helper function for locking the anon_vma root as
237	* we traverse the vma->anon_vma_chain, looping over anon_vma's that
238	* have the same vma.
239	*
240	* Such anon_vma's should have the same root, so you'd expect to see
241	* just a single mutex_lock for the whole traversal.
242	*/
243	static inline struct anon_vma lock_anon_vma_root(struct* anon_vma root, struct* anon_vma *anon_vma)
244	{
245	struct anon_vma *new_root = anon_vma->root;
246	if (new_root != root) {
247	if (WARN_ON_ONCE(root))
248	up_write(sem: &root->rwsem);
249	root = new_root;
250	down_write(sem: &root->rwsem);
251	}
252	return root;
253	}
254
255	static inline void unlock_anon_vma_root(struct anon_vma *root)
256	{
257	if (root)
258	up_write(sem: &root->rwsem);
259	}
260
261	/*
262	* Attach the anon_vmas from src to dst.
263	* Returns 0 on success, -ENOMEM on failure.
264	*
265	* anon_vma_clone() is called by vma_expand(), vma_merge(), __split_vma(),
266	* copy_vma() and anon_vma_fork(). The first four want an exact copy of src,
267	* while the last one, anon_vma_fork(), may try to reuse an existing anon_vma to
268	* prevent endless growth of anon_vma. Since dst->anon_vma is set to NULL before
269	* call, we can identify this case by checking (!dst->anon_vma &&
270	* src->anon_vma).
271	*
272	* If (!dst->anon_vma && src->anon_vma) is true, this function tries to find
273	* and reuse existing anon_vma which has no vmas and only one child anon_vma.
274	* This prevents degradation of anon_vma hierarchy to endless linear chain in
275	* case of constantly forking task. On the other hand, an anon_vma with more
276	* than one child isn't reused even if there was no alive vma, thus rmap
277	* walker has a good chance of avoiding scanning the whole hierarchy when it
278	* searches where page is mapped.
279	*/
280	int anon_vma_clone(struct vm_area_struct dst, struct* vm_area_struct *src)
281	{
282	struct anon_vma_chain avc, pavc;
283	struct anon_vma *root = NULL;
284
285	list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
286	struct anon_vma *anon_vma;
287
288	avc = anon_vma_chain_alloc(GFP_NOWAIT \| __GFP_NOWARN);
289	if (unlikely(!avc)) {
290	unlock_anon_vma_root(root);
291	root = NULL;
292	avc = anon_vma_chain_alloc(GFP_KERNEL);
293	if (!avc)
294	goto enomem_failure;
295	}
296	anon_vma = pavc->anon_vma;
297	root = lock_anon_vma_root(root, anon_vma);
298	anon_vma_chain_link(vma: dst, avc, anon_vma);
299
300	/*
301	* Reuse existing anon_vma if it has no vma and only one
302	* anon_vma child.
303	*
304	* Root anon_vma is never reused:
305	* it has self-parent reference and at least one child.
306	*/
307	if (!dst->anon_vma && src->anon_vma &&
308	anon_vma->num_children < `2` &&
309	anon_vma->num_active_vmas == `0`)
310	dst->anon_vma = anon_vma;
311	}
312	if (dst->anon_vma)
313	dst->anon_vma->num_active_vmas++;
314	unlock_anon_vma_root(root);
315	return `0`;
316
317	enomem_failure:
318	/*
319	* dst->anon_vma is dropped here otherwise its num_active_vmas can
320	* be incorrectly decremented in unlink_anon_vmas().
321	* We can safely do this because callers of anon_vma_clone() don't care
322	* about dst->anon_vma if anon_vma_clone() failed.
323	*/
324	dst->anon_vma = NULL;
325	unlink_anon_vmas(dst);
326	return -ENOMEM;
327	}
328
329	/*
330	* Attach vma to its own anon_vma, as well as to the anon_vmas that
331	* the corresponding VMA in the parent process is attached to.
332	* Returns 0 on success, non-zero on failure.
333	*/
334	int anon_vma_fork(struct vm_area_struct vma, struct* vm_area_struct *pvma)
335	{
336	struct anon_vma_chain *avc;
337	struct anon_vma *anon_vma;
338	int error;
339
340	/ Don't bother if the parent process has no anon_vma here. /
341	if (!pvma->anon_vma)
342	return `0`;
343
344	/ Drop inherited anon_vma, we'll reuse existing or allocate new. /
345	vma->anon_vma = NULL;
346
347	/*
348	* First, attach the new VMA to the parent VMA's anon_vmas,
349	* so rmap can find non-COWed pages in child processes.
350	*/
351	error = anon_vma_clone(dst: vma, src: pvma);
352	if (error)
353	return error;
354
355	/ An existing anon_vma has been reused, all done then. /
356	if (vma->anon_vma)
357	return `0`;
358
359	/ Then add our own anon_vma. /
360	anon_vma = anon_vma_alloc();
361	if (!anon_vma)
362	goto out_error;
363	anon_vma->num_active_vmas++;
364	avc = anon_vma_chain_alloc(GFP_KERNEL);
365	if (!avc)
366	goto out_error_free_anon_vma;
367
368	/*
369	* The root anon_vma's rwsem is the lock actually used when we
370	* lock any of the anon_vmas in this anon_vma tree.
371	*/
372	anon_vma->root = pvma->anon_vma->root;
373	anon_vma->parent = pvma->anon_vma;
374	/*
375	* With refcounts, an anon_vma can stay around longer than the
376	* process it belongs to. The root anon_vma needs to be pinned until
377	* this anon_vma is freed, because the lock lives in the root.
378	*/
379	get_anon_vma(anon_vma: anon_vma->root);
380	/ Mark this anon_vma as the one where our new (COWed) pages go. /
381	vma->anon_vma = anon_vma;
382	anon_vma_lock_write(anon_vma);
383	anon_vma_chain_link(vma, avc, anon_vma);
384	anon_vma->parent->num_children++;
385	anon_vma_unlock_write(anon_vma);
386
387	return `0`;
388
389	out_error_free_anon_vma:
390	put_anon_vma(anon_vma);
391	out_error:
392	unlink_anon_vmas(vma);
393	return -ENOMEM;
394	}
395
396	void unlink_anon_vmas(struct vm_area_struct *vma)
397	{
398	struct anon_vma_chain avc, next;
399	struct anon_vma *root = NULL;
400
401	/*
402	* Unlink each anon_vma chained to the VMA. This list is ordered
403	* from newest to oldest, ensuring the root anon_vma gets freed last.
404	*/
405	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
406	struct anon_vma *anon_vma = avc->anon_vma;
407
408	root = lock_anon_vma_root(root, anon_vma);
409	anon_vma_interval_tree_remove(node: avc, root: &anon_vma->rb_root);
410
411	/*
412	* Leave empty anon_vmas on the list - we'll need
413	* to free them outside the lock.
414	*/
415	if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) {
416	anon_vma->parent->num_children--;
417	continue;
418	}
419
420	list_del(entry: &avc->same_vma);
421	anon_vma_chain_free(anon_vma_chain: avc);
422	}
423	if (vma->anon_vma) {
424	vma->anon_vma->num_active_vmas--;
425
426	/*
427	* vma would still be needed after unlink, and anon_vma will be prepared
428	* when handle fault.
429	*/
430	vma->anon_vma = NULL;
431	}
432	unlock_anon_vma_root(root);
433
434	/*
435	* Iterate the list once more, it now only contains empty and unlinked
436	* anon_vmas, destroy them. Could not do before due to __put_anon_vma()
437	* needing to write-acquire the anon_vma->root->rwsem.
438	*/
439	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
440	struct anon_vma *anon_vma = avc->anon_vma;
441
442	VM_WARN_ON(anon_vma->num_children);
443	VM_WARN_ON(anon_vma->num_active_vmas);
444	put_anon_vma(anon_vma);
445
446	list_del(entry: &avc->same_vma);
447	anon_vma_chain_free(anon_vma_chain: avc);
448	}
449	}
450
451	static void anon_vma_ctor(void *data)
452	{
453	struct anon_vma *anon_vma = data;
454
455	init_rwsem(&anon_vma->rwsem);
456	atomic_set(v: &anon_vma->refcount, i: `0`);
457	anon_vma->rb_root = RB_ROOT_CACHED;
458	}
459
460	void __init anon_vma_init(void)
461	{
462	anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
463	`0`, SLAB_TYPESAFE_BY_RCU\|SLAB_PANIC\|SLAB_ACCOUNT,
464	anon_vma_ctor);
465	anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain,
466	SLAB_PANIC\|SLAB_ACCOUNT);
467	}
468
469	/*
470	* Getting a lock on a stable anon_vma from a page off the LRU is tricky!
471	*
472	* Since there is no serialization what so ever against folio_remove_rmap_*()
473	* the best this function can do is return a refcount increased anon_vma
474	* that might have been relevant to this page.
475	*
476	* The page might have been remapped to a different anon_vma or the anon_vma
477	* returned may already be freed (and even reused).
478	*
479	* In case it was remapped to a different anon_vma, the new anon_vma will be a
480	* child of the old anon_vma, and the anon_vma lifetime rules will therefore
481	* ensure that any anon_vma obtained from the page will still be valid for as
482	* long as we observe page_mapped() [ hence all those page_mapped() tests ].
483	*
484	* All users of this function must be very careful when walking the anon_vma
485	* chain and verify that the page in question is indeed mapped in it
486	* [ something equivalent to page_mapped_in_vma() ].
487	*
488	* Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from
489	* folio_remove_rmap_*() that the anon_vma pointer from page->mapping is valid
490	* if there is a mapcount, we can dereference the anon_vma after observing
491	* those.
492	*
493	* NOTE: the caller should normally hold folio lock when calling this. If
494	* not, the caller needs to double check the anon_vma didn't change after
495	* taking the anon_vma lock for either read or write (UFFDIO_MOVE can modify it
496	* concurrently without folio lock protection). See folio_lock_anon_vma_read()
497	* which has already covered that, and comment above remap_pages().
498	*/
499	struct anon_vma folio_get_anon_vma(const* struct folio *folio)
500	{
501	struct anon_vma *anon_vma = NULL;
502	unsigned long anon_mapping;
503
504	rcu_read_lock();
505	anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
506	if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
507	goto out;
508	if (!folio_mapped(folio))
509	goto out;
510
511	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
512	if (!atomic_inc_not_zero(v: &anon_vma->refcount)) {
513	anon_vma = NULL;
514	goto out;
515	}
516
517	/*
518	* If this folio is still mapped, then its anon_vma cannot have been
519	* freed. But if it has been unmapped, we have no security against the
520	* anon_vma structure being freed and reused (for another anon_vma:
521	* SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero()
522	* above cannot corrupt).
523	*/
524	if (!folio_mapped(folio)) {
525	rcu_read_unlock();
526	put_anon_vma(anon_vma);
527	return NULL;
528	}
529	out:
530	rcu_read_unlock();
531
532	return anon_vma;
533	}
534
535	/*
536	* Similar to folio_get_anon_vma() except it locks the anon_vma.
537	*
538	* Its a little more complex as it tries to keep the fast path to a single
539	* atomic op -- the trylock. If we fail the trylock, we fall back to getting a
540	* reference like with folio_get_anon_vma() and then block on the mutex
541	* on !rwc->try_lock case.
542	*/
543	struct anon_vma folio_lock_anon_vma_read(const* struct folio *folio,
544	struct rmap_walk_control *rwc)
545	{
546	struct anon_vma *anon_vma = NULL;
547	struct anon_vma *root_anon_vma;
548	unsigned long anon_mapping;
549
550	retry:
551	rcu_read_lock();
552	anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
553	if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
554	goto out;
555	if (!folio_mapped(folio))
556	goto out;
557
558	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
559	root_anon_vma = READ_ONCE(anon_vma->root);
560	if (down_read_trylock(sem: &root_anon_vma->rwsem)) {
561	/*
562	* folio_move_anon_rmap() might have changed the anon_vma as we
563	* might not hold the folio lock here.
564	*/
565	if (unlikely((unsigned long)READ_ONCE(folio->mapping) !=
566	anon_mapping)) {
567	up_read(sem: &root_anon_vma->rwsem);
568	rcu_read_unlock();
569	goto retry;
570	}
571
572	/*
573	* If the folio is still mapped, then this anon_vma is still
574	* its anon_vma, and holding the mutex ensures that it will
575	* not go away, see anon_vma_free().
576	*/
577	if (!folio_mapped(folio)) {
578	up_read(sem: &root_anon_vma->rwsem);
579	anon_vma = NULL;
580	}
581	goto out;
582	}
583
584	if (rwc && rwc->try_lock) {
585	anon_vma = NULL;
586	rwc->contended = true;
587	goto out;
588	}
589
590	/ trylock failed, we got to sleep /
591	if (!atomic_inc_not_zero(v: &anon_vma->refcount)) {
592	anon_vma = NULL;
593	goto out;
594	}
595
596	if (!folio_mapped(folio)) {
597	rcu_read_unlock();
598	put_anon_vma(anon_vma);
599	return NULL;
600	}
601
602	/ we pinned the anon_vma, its safe to sleep /
603	rcu_read_unlock();
604	anon_vma_lock_read(anon_vma);
605
606	/*
607	* folio_move_anon_rmap() might have changed the anon_vma as we might
608	* not hold the folio lock here.
609	*/
610	if (unlikely((unsigned long)READ_ONCE(folio->mapping) !=
611	anon_mapping)) {
612	anon_vma_unlock_read(anon_vma);
613	put_anon_vma(anon_vma);
614	anon_vma = NULL;
615	goto retry;
616	}
617
618	if (atomic_dec_and_test(v: &anon_vma->refcount)) {
619	/*
620	* Oops, we held the last refcount, release the lock
621	* and bail -- can't simply use put_anon_vma() because
622	* we'll deadlock on the anon_vma_lock_write() recursion.
623	*/
624	anon_vma_unlock_read(anon_vma);
625	__put_anon_vma(anon_vma);
626	anon_vma = NULL;
627	}
628
629	return anon_vma;
630
631	out:
632	rcu_read_unlock();
633	return anon_vma;
634	}
635
636	#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
637	/*
638	* Flush TLB entries for recently unmapped pages from remote CPUs. It is
639	* important if a PTE was dirty when it was unmapped that it's flushed
640	* before any IO is initiated on the page to prevent lost writes. Similarly,
641	* it must be flushed before freeing to prevent data leakage.
642	*/
643	void try_to_unmap_flush(void)
644	{
645	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
646
647	if (!tlb_ubc->flush_required)
648	return;
649
650	arch_tlbbatch_flush(batch: &tlb_ubc->arch);
651	tlb_ubc->flush_required = false;
652	tlb_ubc->writable = false;
653	}
654
655	/ Flush iff there are potentially writable TLB entries that can race with IO /
656	void try_to_unmap_flush_dirty(void)
657	{
658	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
659
660	if (tlb_ubc->writable)
661	try_to_unmap_flush();
662	}
663
/*
 * Bits 0-14 of mm->tlb_flush_batched record pending generations.
 * Bits 16-30 of mm->tlb_flush_batched record flushed generations.
 *
 * TLB_FLUSH_BATCH_PENDING_MASK selects the pending field (0x7fff);
 * TLB_FLUSH_BATCH_PENDING_LARGE is half of that mask (0x3fff).
 */
#define TLB_FLUSH_BATCH_FLUSHED_SHIFT	16
#define TLB_FLUSH_BATCH_PENDING_MASK			\
	((1 << (TLB_FLUSH_BATCH_FLUSHED_SHIFT - 1)) - 1)
#define TLB_FLUSH_BATCH_PENDING_LARGE			\
	(TLB_FLUSH_BATCH_PENDING_MASK / 2)
673
674	static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
675	unsigned long start, unsigned long end)
676	{
677	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
678	int batch;
679	bool writable = pte_dirty(pte: pteval);
680
681	if (!pte_accessible(mm, a: pteval))
682	return;
683
684	arch_tlbbatch_add_pending(batch: &tlb_ubc->arch, mm, start, end);
685	tlb_ubc->flush_required = true;
686
687	/*
688	* Ensure compiler does not re-order the setting of tlb_flush_batched
689	* before the PTE is cleared.
690	*/
691	barrier();
692	batch = atomic_read(v: &mm->tlb_flush_batched);
693	retry:
694	if ((batch & TLB_FLUSH_BATCH_PENDING_MASK) > TLB_FLUSH_BATCH_PENDING_LARGE) {
695	/*
696	* Prevent `pending' from catching up with `flushed' because of
697	* overflow. Reset `pending' and `flushed' to be 1 and 0 if
698	* `pending' becomes large.
699	*/
700	if (!atomic_try_cmpxchg(v: &mm->tlb_flush_batched, old: &batch, new: `1`))
701	goto retry;
702	} else {
703	atomic_inc(v: &mm->tlb_flush_batched);
704	}
705
706	/*
707	* If the PTE was dirty then it's best to assume it's writable. The
708	* caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
709	* before the page is queued for IO.
710	*/
711	if (writable)
712	tlb_ubc->writable = true;
713	}
714
715	/*
716	* Returns true if the TLB flush should be deferred to the end of a batch of
717	* unmap operations to reduce IPIs.
718	*/
719	static bool should_defer_flush(struct mm_struct mm, enum* ttu_flags flags)
720	{
721	if (!(flags & TTU_BATCH_FLUSH))
722	return false;
723
724	return arch_tlbbatch_should_defer(mm);
725	}
726
727	/*
728	* Reclaim unmaps pages under the PTL but do not flush the TLB prior to
729	* releasing the PTL if TLB flushes are batched. It's possible for a parallel
730	* operation such as mprotect or munmap to race between reclaim unmapping
731	* the page and flushing the page. If this race occurs, it potentially allows
732	* access to data via a stale TLB entry. Tracking all mm's that have TLB
733	* batching in flight would be expensive during reclaim so instead track
734	* whether TLB batching occurred in the past and if so then do a flush here
735	* if required. This will cost one additional flush per reclaim cycle paid
736	* by the first operation at risk such as mprotect and mumap.
737	*
738	* This must be called under the PTL so that an access to tlb_flush_batched
739	* that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
740	* via the PTL.
741	*/
742	void flush_tlb_batched_pending(struct mm_struct *mm)
743	{
744	int batch = atomic_read(v: &mm->tlb_flush_batched);
745	int pending = batch & TLB_FLUSH_BATCH_PENDING_MASK;
746	int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT;
747
748	if (pending != flushed) {
749	arch_flush_tlb_batched_pending(mm);
750	/*
751	* If the new TLB flushing is pending during flushing, leave
752	* mm->tlb_flush_batched as is, to avoid losing flushing.
753	*/
754	atomic_cmpxchg(v: &mm->tlb_flush_batched, old: batch,
755	new: pending \| (pending << TLB_FLUSH_BATCH_FLUSHED_SHIFT));
756	}
757	}
758	#else
759	static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
760	unsigned long start, unsigned long end)
761	{
762	}
763
764	static bool should_defer_flush(struct mm_struct mm, enum* ttu_flags flags)
765	{
766	return false;
767	}
768	#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
769
770	/**
771	* page_address_in_vma - The virtual address of a page in this VMA.
772	* @folio: The folio containing the page.
773	* @page: The page within the folio.
774	* @vma: The VMA we need to know the address in.
775	*
776	* Calculates the user virtual address of this page in the specified VMA.
777	* It is the caller's responsibility to check the page is actually
778	* within the VMA. There may not currently be a PTE pointing at this
779	* page, but if a page fault occurs at this address, this is the page
780	* which will be accessed.
781	*
782	* Context: Caller should hold a reference to the folio. Caller should
783	* hold a lock (eg the i_mmap_lock or the mmap_lock) which keeps the
784	* VMA from being altered.
785	*
786	* Return: The virtual address corresponding to this page in the VMA.
787	*/
788	unsigned long page_address_in_vma(const struct folio *folio,
789	const struct page page, const* struct vm_area_struct *vma)
790	{
791	if (folio_test_anon(folio)) {
792	struct anon_vma *anon_vma = folio_anon_vma(folio);
793	/*
794	* Note: swapoff's unuse_vma() is more efficient with this
795	* check, and needs it to match anon_vma when KSM is active.
796	*/
797	if (!vma->anon_vma \|\| !anon_vma \|\|
798	vma->anon_vma->root != anon_vma->root)
799	return -EFAULT;
800	} else if (!vma->vm_file) {
801	return -EFAULT;
802	} else if (vma->vm_file->f_mapping != folio->mapping) {
803	return -EFAULT;
804	}
805
806	/ KSM folios don't reach here because of the !anon_vma check /
807	return vma_address(vma, pgoff: page_pgoff(folio, page), nr_pages: `1`);
808	}
809
810	/*
811	* Returns the actual pmd_t* where we expect 'address' to be mapped from, or
812	* NULL if it doesn't exist. No guarantees / checks on what the pmd_t*
813	* represents.
814	*/
815	pmd_t mm_find_pmd(struct* mm_struct mm, unsigned* long address)
816	{
817	pgd_t *pgd;
818	p4d_t *p4d;
819	pud_t *pud;
820	pmd_t *pmd = NULL;
821
822	pgd = pgd_offset(mm, address);
823	if (!pgd_present(pgd: *pgd))
824	goto out;
825
826	p4d = p4d_offset(pgd, address);
827	if (!p4d_present(p4d: *p4d))
828	goto out;
829
830	pud = pud_offset(p4d, address);
831	if (!pud_present(pud: *pud))
832	goto out;
833
834	pmd = pmd_offset(pud, address);
835	out:
836	return pmd;
837	}
838
/* State shared between folio_referenced() and its per-VMA rmap callbacks. */
struct folio_referenced_arg {
	/* Mappings still to visit; seeded from folio_mapcount(), counted down. */
	int mapcount;
	/* Number of VMAs that referenced the folio; set to -1 on early bail-out. */
	int referenced;
	/* Union of the vm_flags reported back to the caller. */
	unsigned long vm_flags;
	/* Target memory cgroup, or NULL for no cgroup filtering. */
	struct mem_cgroup *memcg;
};
845
846	/*
847	* arg: folio_referenced_arg will be passed
848	*/
849	static bool folio_referenced_one(struct folio *folio,
850	struct vm_area_struct vma, unsigned* long address, void *arg)
851	{
852	struct folio_referenced_arg *pra = arg;
853	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, `0`);
854	int referenced = `0`;
855	unsigned long start = address, ptes = `0`;
856
857	while (page_vma_mapped_walk(pvmw: &pvmw)) {
858	address = pvmw.address;
859
860	if (vma->vm_flags & VM_LOCKED) {
861	if (!folio_test_large(folio) \|\| !pvmw.pte) {
862	/ Restore the mlock which got missed /
863	mlock_vma_folio(folio, vma);
864	page_vma_mapped_walk_done(pvmw: &pvmw);
865	pra->vm_flags \|= VM_LOCKED;
866	return false; / To break the loop /
867	}
868	/*
869	* For large folio fully mapped to VMA, will
870	* be handled after the pvmw loop.
871	*
872	* For large folio cross VMA boundaries, it's
873	* expected to be picked by page reclaim. But
874	* should skip reference of pages which are in
875	* the range of VM_LOCKED vma. As page reclaim
876	* should just count the reference of pages out
877	* the range of VM_LOCKED vma.
878	*/
879	ptes++;
880	pra->mapcount--;
881	continue;
882	}
883
884	/*
885	* Skip the non-shared swapbacked folio mapped solely by
886	* the exiting or OOM-reaped process. This avoids redundant
887	* swap-out followed by an immediate unmap.
888	*/
889	if ((!atomic_read(v: &vma->vm_mm->mm_users) \|\|
890	check_stable_address_space(mm: vma->vm_mm)) &&
891	folio_test_anon(folio) && folio_test_swapbacked(folio) &&
892	!folio_maybe_mapped_shared(folio)) {
893	pra->referenced = -`1`;
894	page_vma_mapped_walk_done(pvmw: &pvmw);
895	return false;
896	}
897
898	if (lru_gen_enabled() && pvmw.pte) {
899	if (lru_gen_look_around(pvmw: &pvmw))
900	referenced++;
901	} else if (pvmw.pte) {
902	if (ptep_clear_flush_young_notify(vma, address,
903	pvmw.pte))
904	referenced++;
905	} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
906	if (pmdp_clear_flush_young_notify(vma, address,
907	pvmw.pmd))
908	referenced++;
909	} else {
910	/ unexpected pmd-mapped folio? /
911	WARN_ON_ONCE(`1`);
912	}
913
914	pra->mapcount--;
915	}
916
917	if ((vma->vm_flags & VM_LOCKED) &&
918	folio_test_large(folio) &&
919	folio_within_vma(folio, vma)) {
920	unsigned long s_align, e_align;
921
922	s_align = ALIGN_DOWN(start, PMD_SIZE);
923	e_align = ALIGN_DOWN(start + folio_size(folio) - `1`, PMD_SIZE);
924
925	/ folio doesn't cross page table boundary and fully mapped /
926	if ((s_align == e_align) && (ptes == folio_nr_pages(folio))) {
927	/ Restore the mlock which got missed /
928	mlock_vma_folio(folio, vma);
929	pra->vm_flags \|= VM_LOCKED;
930	return false; / To break the loop /
931	}
932	}
933
934	if (referenced)
935	folio_clear_idle(folio);
936	if (folio_test_clear_young(folio))
937	referenced++;
938
939	if (referenced) {
940	pra->referenced++;
941	pra->vm_flags \|= vma->vm_flags & ~VM_LOCKED;
942	}
943
944	if (!pra->mapcount)
945	return false; / To break the loop /
946
947	return true;
948	}
949
950	static bool invalid_folio_referenced_vma(struct vm_area_struct vma, void* *arg)
951	{
952	struct folio_referenced_arg *pra = arg;
953	struct mem_cgroup *memcg = pra->memcg;
954
955	/*
956	* Ignore references from this mapping if it has no recency. If the
957	* folio has been used in another mapping, we will catch it; if this
958	* other mapping is already gone, the unmap path will have set the
959	* referenced flag or activated the folio in zap_pte_range().
960	*/
961	if (!vma_has_recency(vma))
962	return true;
963
964	/*
965	* If we are reclaiming on behalf of a cgroup, skip counting on behalf
966	* of references from different cgroups.
967	*/
968	if (memcg && !mm_match_cgroup(mm: vma->vm_mm, memcg))
969	return true;
970
971	return false;
972	}
973
974	/**
975	* folio_referenced() - Test if the folio was referenced.
976	* @folio: The folio to test.
977	* @is_locked: Caller holds lock on the folio.
978	* @memcg: target memory cgroup
979	* @vm_flags: A combination of all the vma->vm_flags which referenced the folio.
980	*
981	* Quick test_and_clear_referenced for all mappings of a folio,
982	*
983	* Return: The number of mappings which referenced the folio. Return -1 if
984	* the function bailed out due to rmap lock contention.
985	*/
986	int folio_referenced(struct folio folio, int* is_locked,
987	struct mem_cgroup memcg, unsigned* long *vm_flags)
988	{
989	bool we_locked = false;
990	struct folio_referenced_arg pra = {
991	.mapcount = folio_mapcount(folio),
992	.memcg = memcg,
993	};
994	struct rmap_walk_control rwc = {
995	.rmap_one = folio_referenced_one,
996	.arg = (void *)&pra,
997	.anon_lock = folio_lock_anon_vma_read,
998	.try_lock = true,
999	.invalid_vma = invalid_folio_referenced_vma,
1000	};
1001
1002	*vm_flags = `0`;
1003	if (!pra.mapcount)
1004	return `0`;
1005
1006	if (!folio_raw_mapping(folio))
1007	return `0`;
1008
1009	if (!is_locked && (!folio_test_anon(folio) \|\| folio_test_ksm(folio))) {
1010	we_locked = folio_trylock(folio);
1011	if (!we_locked)
1012	return `1`;
1013	}
1014
1015	rmap_walk(folio, rwc: &rwc);
1016	*vm_flags = pra.vm_flags;
1017
1018	if (we_locked)
1019	folio_unlock(folio);
1020
1021	return rwc.contended ? -`1` : pra.referenced;
1022	}
1023
1024	static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw)
1025	{
1026	int cleaned = `0`;
1027	struct vm_area_struct *vma = pvmw->vma;
1028	struct mmu_notifier_range range;
1029	unsigned long address = pvmw->address;
1030
1031	/*
1032	* We have to assume the worse case ie pmd for invalidation. Note that
1033	* the folio can not be freed from this function.
1034	*/
1035	mmu_notifier_range_init(range: &range, event: MMU_NOTIFY_PROTECTION_PAGE, flags: `0`,
1036	mm: vma->vm_mm, start: address, end: vma_address_end(pvmw));
1037	mmu_notifier_invalidate_range_start(range: &range);
1038
1039	while (page_vma_mapped_walk(pvmw)) {
1040	int ret = `0`;
1041
1042	address = pvmw->address;
1043	if (pvmw->pte) {
1044	pte_t *pte = pvmw->pte;
1045	pte_t entry = ptep_get(ptep: pte);
1046
1047	/*
1048	* PFN swap PTEs, such as device-exclusive ones, that
1049	* actually map pages are clean and not writable from a
1050	* CPU perspective. The MMU notifier takes care of any
1051	* device aspects.
1052	*/
1053	if (!pte_present(a: entry))
1054	continue;
1055	if (!pte_dirty(pte: entry) && !pte_write(pte: entry))
1056	continue;
1057
1058	flush_cache_page(vma, vmaddr: address, pfn: pte_pfn(pte: entry));
1059	entry = ptep_clear_flush(vma, address, ptep: pte);
1060	entry = pte_wrprotect(pte: entry);
1061	entry = pte_mkclean(pte: entry);
1062	set_pte_at(vma->vm_mm, address, pte, entry);
1063	ret = `1`;
1064	} else {
1065	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1066	pmd_t *pmd = pvmw->pmd;
1067	pmd_t entry;
1068
1069	if (!pmd_dirty(pmd: pmd) && !pmd_write(pmd: pmd))
1070	continue;
1071
1072	flush_cache_range(vma, start: address,
1073	end: address + HPAGE_PMD_SIZE);
1074	entry = pmdp_invalidate(vma, address, pmdp: pmd);
1075	entry = pmd_wrprotect(pmd: entry);
1076	entry = pmd_mkclean(pmd: entry);
1077	set_pmd_at(mm: vma->vm_mm, addr: address, pmdp: pmd, pmd: entry);
1078	ret = `1`;
1079	#else
1080	/ unexpected pmd-mapped folio? /
1081	WARN_ON_ONCE(`1`);
1082	#endif
1083	}
1084
1085	if (ret)
1086	cleaned++;
1087	}
1088
1089	mmu_notifier_invalidate_range_end(range: &range);
1090
1091	return cleaned;
1092	}
1093
1094	static bool page_mkclean_one(struct folio folio, struct* vm_area_struct *vma,
1095	unsigned long address, void *arg)
1096	{
1097	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC);
1098	int *cleaned = arg;
1099
1100	*cleaned += page_vma_mkclean_one(pvmw: &pvmw);
1101
1102	return true;
1103	}
1104
1105	static bool invalid_mkclean_vma(struct vm_area_struct vma, void* *arg)
1106	{
1107	if (vma->vm_flags & VM_SHARED)
1108	return false;
1109
1110	return true;
1111	}
1112
1113	int folio_mkclean(struct folio *folio)
1114	{
1115	int cleaned = `0`;
1116	struct address_space *mapping;
1117	struct rmap_walk_control rwc = {
1118	.arg = (void *)&cleaned,
1119	.rmap_one = page_mkclean_one,
1120	.invalid_vma = invalid_mkclean_vma,
1121	};
1122
1123	BUG_ON(!folio_test_locked(folio));
1124
1125	if (!folio_mapped(folio))
1126	return `0`;
1127
1128	mapping = folio_mapping(folio);
1129	if (!mapping)
1130	return `0`;
1131
1132	rmap_walk(folio, rwc: &rwc);
1133
1134	return cleaned;
1135	}
1136	EXPORT_SYMBOL_GPL(folio_mkclean);
1137
1138	struct wrprotect_file_state {
1139	int cleaned;
1140	pgoff_t pgoff;
1141	unsigned long pfn;
1142	unsigned long nr_pages;
1143	};
1144
1145	static bool mapping_wrprotect_range_one(struct folio *folio,
1146	struct vm_area_struct vma, unsigned* long address, void *arg)
1147	{
1148	struct wrprotect_file_state state = (struct* wrprotect_file_state *)arg;
1149	struct page_vma_mapped_walk pvmw = {
1150	.pfn = state->pfn,
1151	.nr_pages = state->nr_pages,
1152	.pgoff = state->pgoff,
1153	.vma = vma,
1154	.address = address,
1155	.flags = PVMW_SYNC,
1156	};
1157
1158	state->cleaned += page_vma_mkclean_one(pvmw: &pvmw);
1159
1160	return true;
1161	}
1162
1163	static void __rmap_walk_file(struct folio folio, struct* address_space *mapping,
1164	pgoff_t pgoff_start, unsigned long nr_pages,
1165	struct rmap_walk_control *rwc, bool locked);
1166
1167	/**
1168	* mapping_wrprotect_range() - Write-protect all mappings in a specified range.
1169	*
1170	* @mapping: The mapping whose reverse mapping should be traversed.
1171	* @pgoff: The page offset at which @pfn is mapped within @mapping.
1172	* @pfn: The PFN of the page mapped in @mapping at @pgoff.
1173	* @nr_pages: The number of physically contiguous base pages spanned.
1174	*
1175	* Traverses the reverse mapping, finding all VMAs which contain a shared
1176	* mapping of the pages in the specified range in @mapping, and write-protects
1177	* them (that is, updates the page tables to mark the mappings read-only such
1178	* that a write protection fault arises when the mappings are written to).
1179	*
1180	* The @pfn value need not refer to a folio, but rather can reference a kernel
1181	* allocation which is mapped into userland. We therefore do not require that
1182	* the page maps to a folio with a valid mapping or index field, rather the
1183	* caller specifies these in @mapping and @pgoff.
1184	*
1185	* Return: the number of write-protected PTEs, or an error.
1186	*/
1187	int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff,
1188	unsigned long pfn, unsigned long nr_pages)
1189	{
1190	struct wrprotect_file_state state = {
1191	.cleaned = `0`,
1192	.pgoff = pgoff,
1193	.pfn = pfn,
1194	.nr_pages = nr_pages,
1195	};
1196	struct rmap_walk_control rwc = {
1197	.arg = (void *)&state,
1198	.rmap_one = mapping_wrprotect_range_one,
1199	.invalid_vma = invalid_mkclean_vma,
1200	};
1201
1202	if (!mapping)
1203	return `0`;
1204
1205	__rmap_walk_file(/ folio = /NULL, mapping, pgoff_start: pgoff, nr_pages, rwc: &rwc,
1206	/ locked = /false);
1207
1208	return state.cleaned;
1209	}
1210	EXPORT_SYMBOL_GPL(mapping_wrprotect_range);
1211
1212	/**
1213	* pfn_mkclean_range - Cleans the PTEs (including PMDs) mapped with range of
1214	* [@pfn, @pfn + @nr_pages) at the specific offset (@pgoff)
1215	* within the @vma of shared mappings. And since clean PTEs
1216	* should also be readonly, write protects them too.
1217	* @pfn: start pfn.
1218	* @nr_pages: number of physically contiguous pages srarting with @pfn.
1219	* @pgoff: page offset that the @pfn mapped with.
1220	* @vma: vma that @pfn mapped within.
1221	*
1222	* Returns the number of cleaned PTEs (including PMDs).
1223	*/
1224	int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
1225	struct vm_area_struct *vma)
1226	{
1227	struct page_vma_mapped_walk pvmw = {
1228	.pfn = pfn,
1229	.nr_pages = nr_pages,
1230	.pgoff = pgoff,
1231	.vma = vma,
1232	.flags = PVMW_SYNC,
1233	};
1234
1235	if (invalid_mkclean_vma(vma, NULL))
1236	return `0`;
1237
1238	pvmw.address = vma_address(vma, pgoff, nr_pages);
1239	VM_BUG_ON_VMA(pvmw.address == -EFAULT, vma);
1240
1241	return page_vma_mkclean_one(pvmw: &pvmw);
1242	}
1243
1244	static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
1245	struct page page, int* nr_pages, struct vm_area_struct *vma,
1246	enum rmap_level level, int *nr_pmdmapped)
1247	{
1248	atomic_t *mapped = &folio->_nr_pages_mapped;
1249	const int orig_nr_pages = nr_pages;
1250	int first = `0`, nr = `0`;
1251
1252	__folio_rmap_sanity_checks(folio, page, nr_pages, level);
1253
1254	switch (level) {
1255	case RMAP_LEVEL_PTE:
1256	if (!folio_test_large(folio)) {
1257	nr = atomic_inc_and_test(v: &folio->_mapcount);
1258	break;
1259	}
1260
1261	if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
1262	nr = folio_add_return_large_mapcount(folio, diff: orig_nr_pages, vma);
1263	if (nr == orig_nr_pages)
1264	/ Was completely unmapped. /
1265	nr = folio_large_nr_pages(folio);
1266	else
1267	nr = `0`;
1268	break;
1269	}
1270
1271	do {
1272	first += atomic_inc_and_test(v: &page->_mapcount);
1273	} while (page++, --nr_pages > `0`);
1274
1275	if (first &&
1276	atomic_add_return_relaxed(i: first, v: mapped) < ENTIRELY_MAPPED)
1277	nr = first;
1278
1279	folio_add_large_mapcount(folio, diff: orig_nr_pages, vma);
1280	break;
1281	case RMAP_LEVEL_PMD:
1282	case RMAP_LEVEL_PUD:
1283	first = atomic_inc_and_test(v: &folio->_entire_mapcount);
1284	if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
1285	if (level == RMAP_LEVEL_PMD && first)
1286	*nr_pmdmapped = folio_large_nr_pages(folio);
1287	nr = folio_inc_return_large_mapcount(folio, vma);
1288	if (nr == `1`)
1289	/ Was completely unmapped. /
1290	nr = folio_large_nr_pages(folio);
1291	else
1292	nr = `0`;
1293	break;
1294	}
1295
1296	if (first) {
1297	nr = atomic_add_return_relaxed(ENTIRELY_MAPPED, v: mapped);
1298	if (likely(nr < ENTIRELY_MAPPED + ENTIRELY_MAPPED)) {
1299	nr_pages = folio_large_nr_pages(folio);
1300	/*
1301	* We only track PMD mappings of PMD-sized
1302	* folios separately.
1303	*/
1304	if (level == RMAP_LEVEL_PMD)
1305	*nr_pmdmapped = nr_pages;
1306	nr = nr_pages - (nr & FOLIO_PAGES_MAPPED);
1307	/ Raced ahead of a remove and another add? /
1308	if (unlikely(nr < `0`))
1309	nr = `0`;
1310	} else {
1311	/ Raced ahead of a remove of ENTIRELY_MAPPED /
1312	nr = `0`;
1313	}
1314	}
1315	folio_inc_large_mapcount(folio, vma);
1316	break;
1317	}
1318	return nr;
1319	}
1320
1321	/**
1322	* folio_move_anon_rmap - move a folio to our anon_vma
1323	* @folio: The folio to move to our anon_vma
1324	* @vma: The vma the folio belongs to
1325	*
1326	* When a folio belongs exclusively to one process after a COW event,
1327	* that folio can be moved into the anon_vma that belongs to just that
1328	* process, so the rmap code will not search the parent or sibling processes.
1329	*/
1330	void folio_move_anon_rmap(struct folio folio, struct* vm_area_struct *vma)
1331	{
1332	void *anon_vma = vma->anon_vma;
1333
1334	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
1335	VM_BUG_ON_VMA(!anon_vma, vma);
1336
1337	anon_vma += PAGE_MAPPING_ANON;
1338	/*
1339	* Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written
1340	* simultaneously, so a concurrent reader (eg folio_referenced()'s
1341	* folio_test_anon()) will not see one without the other.
1342	*/
1343	WRITE_ONCE(folio->mapping, anon_vma);
1344	}
1345
1346	/**
1347	* __folio_set_anon - set up a new anonymous rmap for a folio
1348	* @folio: The folio to set up the new anonymous rmap for.
1349	* @vma: VM area to add the folio to.
1350	* @address: User virtual address of the mapping
1351	* @exclusive: Whether the folio is exclusive to the process.
1352	*/
1353	static void __folio_set_anon(struct folio folio, struct* vm_area_struct *vma,
1354	unsigned long address, bool exclusive)
1355	{
1356	struct anon_vma *anon_vma = vma->anon_vma;
1357
1358	BUG_ON(!anon_vma);
1359
1360	/*
1361	* If the folio isn't exclusive to this vma, we must use the _oldest_
1362	* possible anon_vma for the folio mapping!
1363	*/
1364	if (!exclusive)
1365	anon_vma = anon_vma->root;
1366
1367	/*
1368	* page_idle does a lockless/optimistic rmap scan on folio->mapping.
1369	* Make sure the compiler doesn't split the stores of anon_vma and
1370	* the PAGE_MAPPING_ANON type identifier, otherwise the rmap code
1371	* could mistake the mapping for a struct address_space and crash.
1372	*/
1373	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
1374	WRITE_ONCE(folio->mapping, (struct address_space *) anon_vma);
1375	folio->index = linear_page_index(vma, address);
1376	}
1377
1378	/**
1379	* __page_check_anon_rmap - sanity check anonymous rmap addition
1380	* @folio: The folio containing @page.
1381	* @page: the page to check the mapping of
1382	* @vma: the vm area in which the mapping is added
1383	* @address: the user virtual address mapped
1384	*/
1385	static void __page_check_anon_rmap(const struct folio *folio,
1386	const struct page page, struct* vm_area_struct *vma,
1387	unsigned long address)
1388	{
1389	/*
1390	* The page's anon-rmap details (mapping and index) are guaranteed to
1391	* be set up correctly at this point.
1392	*
1393	* We have exclusion against folio_add_anon_rmap_*() because the caller
1394	* always holds the page locked.
1395	*
1396	* We have exclusion against folio_add_new_anon_rmap because those pages
1397	* are initially only visible via the pagetables, and the pte is locked
1398	* over the call to folio_add_new_anon_rmap.
1399	*/
1400	VM_BUG_ON_FOLIO(folio_anon_vma(folio)->root != vma->anon_vma->root,
1401	folio);
1402	VM_BUG_ON_PAGE(page_pgoff(folio, page) != linear_page_index(vma, address),
1403	page);
1404	}
1405
1406	static void __folio_mod_stat(struct folio folio, int* nr, int nr_pmdmapped)
1407	{
1408	int idx;
1409
1410	if (nr) {
1411	idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED;
1412	__lruvec_stat_mod_folio(folio, idx, val: nr);
1413	}
1414	if (nr_pmdmapped) {
1415	if (folio_test_anon(folio)) {
1416	idx = NR_ANON_THPS;
1417	__lruvec_stat_mod_folio(folio, idx, val: nr_pmdmapped);
1418	} else {
1419	/ NR__PMDMAPPED are not maintained per-memcg /*
1420	idx = folio_test_swapbacked(folio) ?
1421	NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED;
1422	__mod_node_page_state(folio_pgdat(folio), item: idx,
1423	nr_pmdmapped);
1424	}
1425	}
1426	}
1427
1428	static __always_inline void __folio_add_anon_rmap(struct folio *folio,
1429	struct page page, int* nr_pages, struct vm_area_struct *vma,
1430	unsigned long address, rmap_t flags, enum rmap_level level)
1431	{
1432	int i, nr, nr_pmdmapped = `0`;
1433
1434	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
1435
1436	nr = __folio_add_rmap(folio, page, nr_pages, vma, level, nr_pmdmapped: &nr_pmdmapped);
1437
1438	if (likely(!folio_test_ksm(folio)))
1439	__page_check_anon_rmap(folio, page, vma, address);
1440
1441	__folio_mod_stat(folio, nr, nr_pmdmapped);
1442
1443	if (flags & RMAP_EXCLUSIVE) {
1444	switch (level) {
1445	case RMAP_LEVEL_PTE:
1446	for (i = `0`; i < nr_pages; i++)
1447	SetPageAnonExclusive(page + i);
1448	break;
1449	case RMAP_LEVEL_PMD:
1450	SetPageAnonExclusive(page);
1451	break;
1452	case RMAP_LEVEL_PUD:
1453	/*
1454	* Keep the compiler happy, we don't support anonymous
1455	* PUD mappings.
1456	*/
1457	WARN_ON_ONCE(`1`);
1458	break;
1459	}
1460	}
1461
1462	VM_WARN_ON_FOLIO(!folio_test_large(folio) && PageAnonExclusive(page) &&
1463	atomic_read(&folio->_mapcount) > `0`, folio);
1464	for (i = `0`; i < nr_pages; i++) {
1465	struct page *cur_page = page + i;
1466
1467	VM_WARN_ON_FOLIO(folio_test_large(folio) &&
1468	folio_entire_mapcount(folio) > `1` &&
1469	PageAnonExclusive(cur_page), folio);
1470	if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT))
1471	continue;
1472
1473	/*
1474	* While PTE-mapping a THP we have a PMD and a PTE
1475	* mapping.
1476	*/
1477	VM_WARN_ON_FOLIO(atomic_read(&cur_page->_mapcount) > `0` &&
1478	PageAnonExclusive(cur_page), folio);
1479	}
1480
1481	/*
1482	* For large folio, only mlock it if it's fully mapped to VMA. It's
1483	* not easy to check whether the large folio is fully mapped to VMA
1484	* here. Only mlock normal 4K folio and leave page reclaim to handle
1485	* large folio.
1486	*/
1487	if (!folio_test_large(folio))
1488	mlock_vma_folio(folio, vma);
1489	}
1490
1491	/**
1492	* folio_add_anon_rmap_ptes - add PTE mappings to a page range of an anon folio
1493	* @folio: The folio to add the mappings to
1494	* @page: The first page to add
1495	* @nr_pages: The number of pages which will be mapped
1496	* @vma: The vm area in which the mappings are added
1497	* @address: The user virtual address of the first page to map
1498	* @flags: The rmap flags
1499	*
1500	* The page range of folio is defined by [first_page, first_page + nr_pages)
1501	*
1502	* The caller needs to hold the page table lock, and the page must be locked in
1503	* the anon_vma case: to serialize mapping,index checking after setting,
1504	* and to ensure that an anon folio is not being upgraded racily to a KSM folio
1505	* (but KSM folios are never downgraded).
1506	*/
1507	void folio_add_anon_rmap_ptes(struct folio folio, struct* page *page,
1508	int nr_pages, struct vm_area_struct vma, unsigned* long address,
1509	rmap_t flags)
1510	{
1511	__folio_add_anon_rmap(folio, page, nr_pages, vma, address, flags,
1512	level: RMAP_LEVEL_PTE);
1513	}
1514
1515	/**
1516	* folio_add_anon_rmap_pmd - add a PMD mapping to a page range of an anon folio
1517	* @folio: The folio to add the mapping to
1518	* @page: The first page to add
1519	* @vma: The vm area in which the mapping is added
1520	* @address: The user virtual address of the first page to map
1521	* @flags: The rmap flags
1522	*
1523	* The page range of folio is defined by [first_page, first_page + HPAGE_PMD_NR)
1524	*
1525	* The caller needs to hold the page table lock, and the page must be locked in
1526	* the anon_vma case: to serialize mapping,index checking after setting.
1527	*/
1528	void folio_add_anon_rmap_pmd(struct folio folio, struct* page *page,
1529	struct vm_area_struct vma, unsigned* long address, rmap_t flags)
1530	{
1531	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1532	__folio_add_anon_rmap(folio, page, HPAGE_PMD_NR, vma, address, flags,
1533	level: RMAP_LEVEL_PMD);
1534	#else
1535	WARN_ON_ONCE(true);
1536	#endif
1537	}
1538
1539	/**
1540	* folio_add_new_anon_rmap - Add mapping to a new anonymous folio.
1541	* @folio: The folio to add the mapping to.
1542	* @vma: the vm area in which the mapping is added
1543	* @address: the user virtual address mapped
1544	* @flags: The rmap flags
1545	*
1546	* Like folio_add_anon_rmap_() but must only be called on new* folios.
1547	* This means the inc-and-test can be bypassed.
1548	* The folio doesn't necessarily need to be locked while it's exclusive
1549	* unless two threads map it concurrently. However, the folio must be
1550	* locked if it's shared.
1551	*
1552	* If the folio is pmd-mappable, it is accounted as a THP.
1553	*/
1554	void folio_add_new_anon_rmap(struct folio folio, struct* vm_area_struct *vma,
1555	unsigned long address, rmap_t flags)
1556	{
1557	const bool exclusive = flags & RMAP_EXCLUSIVE;
1558	int nr = `1`, nr_pmdmapped = `0`;
1559
1560	VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
1561	VM_WARN_ON_FOLIO(!exclusive && !folio_test_locked(folio), folio);
1562
1563	/*
1564	* VM_DROPPABLE mappings don't swap; instead they're just dropped when
1565	* under memory pressure.
1566	*/
1567	if (!folio_test_swapbacked(folio) && !(vma->vm_flags & VM_DROPPABLE))
1568	__folio_set_swapbacked(folio);
1569	__folio_set_anon(folio, vma, address, exclusive);
1570
1571	if (likely(!folio_test_large(folio))) {
1572	/ increment count (starts at -1) /
1573	atomic_set(v: &folio->_mapcount, i: `0`);
1574	if (exclusive)
1575	SetPageAnonExclusive(&folio->page);
1576	} else if (!folio_test_pmd_mappable(folio)) {
1577	int i;
1578
1579	nr = folio_large_nr_pages(folio);
1580	for (i = `0`; i < nr; i++) {
1581	struct page *page = folio_page(folio, i);
1582
1583	if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
1584	/ increment count (starts at -1) /
1585	atomic_set(v: &page->_mapcount, i: `0`);
1586	if (exclusive)
1587	SetPageAnonExclusive(page);
1588	}
1589
1590	folio_set_large_mapcount(folio, mapcount: nr, vma);
1591	if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
1592	atomic_set(v: &folio->_nr_pages_mapped, i: nr);
1593	} else {
1594	nr = folio_large_nr_pages(folio);
1595	/ increment count (starts at -1) /
1596	atomic_set(v: &folio->_entire_mapcount, i: `0`);
1597	folio_set_large_mapcount(folio, mapcount: `1`, vma);
1598	if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
1599	atomic_set(v: &folio->_nr_pages_mapped, ENTIRELY_MAPPED);
1600	if (exclusive)
1601	SetPageAnonExclusive(&folio->page);
1602	nr_pmdmapped = nr;
1603	}
1604
1605	VM_WARN_ON_ONCE(address < vma->vm_start \|\|
1606	address + (nr << PAGE_SHIFT) > vma->vm_end);
1607
1608	__folio_mod_stat(folio, nr, nr_pmdmapped);
1609	mod_mthp_stat(order: folio_order(folio), item: MTHP_STAT_NR_ANON, delta: `1`);
1610	}
1611
1612	static __always_inline void __folio_add_file_rmap(struct folio *folio,
1613	struct page page, int* nr_pages, struct vm_area_struct *vma,
1614	enum rmap_level level)
1615	{
1616	int nr, nr_pmdmapped = `0`;
1617
1618	VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);
1619
1620	nr = __folio_add_rmap(folio, page, nr_pages, vma, level, nr_pmdmapped: &nr_pmdmapped);
1621	__folio_mod_stat(folio, nr, nr_pmdmapped);
1622
1623	/ See comments in folio_add_anon_rmap_() /*
1624	if (!folio_test_large(folio))
1625	mlock_vma_folio(folio, vma);
1626	}
1627
1628	/**
1629	* folio_add_file_rmap_ptes - add PTE mappings to a page range of a folio
1630	* @folio: The folio to add the mappings to
1631	* @page: The first page to add
1632	* @nr_pages: The number of pages that will be mapped using PTEs
1633	* @vma: The vm area in which the mappings are added
1634	*
1635	* The page range of the folio is defined by [page, page + nr_pages)
1636	*
1637	* The caller needs to hold the page table lock.
1638	*/
1639	void folio_add_file_rmap_ptes(struct folio folio, struct* page *page,
1640	int nr_pages, struct vm_area_struct *vma)
1641	{
1642	__folio_add_file_rmap(folio, page, nr_pages, vma, level: RMAP_LEVEL_PTE);
1643	}
1644
1645	/**
1646	* folio_add_file_rmap_pmd - add a PMD mapping to a page range of a folio
1647	* @folio: The folio to add the mapping to
1648	* @page: The first page to add
1649	* @vma: The vm area in which the mapping is added
1650	*
1651	* The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
1652	*
1653	* The caller needs to hold the page table lock.
1654	*/
1655	void folio_add_file_rmap_pmd(struct folio folio, struct* page *page,
1656	struct vm_area_struct *vma)
1657	{
1658	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1659	__folio_add_file_rmap(folio, page, HPAGE_PMD_NR, vma, level: RMAP_LEVEL_PMD);
1660	#else
1661	WARN_ON_ONCE(true);
1662	#endif
1663	}
1664
1665	/**
1666	* folio_add_file_rmap_pud - add a PUD mapping to a page range of a folio
1667	* @folio: The folio to add the mapping to
1668	* @page: The first page to add
1669	* @vma: The vm area in which the mapping is added
1670	*
1671	* The page range of the folio is defined by [page, page + HPAGE_PUD_NR)
1672	*
1673	* The caller needs to hold the page table lock.
1674	*/
1675	void folio_add_file_rmap_pud(struct folio folio, struct* page *page,
1676	struct vm_area_struct *vma)
1677	{
1678	#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
1679	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
1680	__folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, level: RMAP_LEVEL_PUD);
1681	#else
1682	WARN_ON_ONCE(true);
1683	#endif
1684	}
1685
1686	static __always_inline void __folio_remove_rmap(struct folio *folio,
1687	struct page page, int* nr_pages, struct vm_area_struct *vma,
1688	enum rmap_level level)
1689	{
1690	atomic_t *mapped = &folio->_nr_pages_mapped;
1691	int last = `0`, nr = `0`, nr_pmdmapped = `0`;
1692	bool partially_mapped = false;
1693
1694	__folio_rmap_sanity_checks(folio, page, nr_pages, level);
1695
1696	switch (level) {
1697	case RMAP_LEVEL_PTE:
1698	if (!folio_test_large(folio)) {
1699	nr = atomic_add_negative(i: -`1`, v: &folio->_mapcount);
1700	break;
1701	}
1702
1703	if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
1704	nr = folio_sub_return_large_mapcount(folio, diff: nr_pages, vma);
1705	if (!nr) {
1706	/ Now completely unmapped. /
1707	nr = folio_nr_pages(folio);
1708	} else {
1709	partially_mapped = nr < folio_large_nr_pages(folio) &&
1710	!folio_entire_mapcount(folio);
1711	nr = `0`;
1712	}
1713	break;
1714	}
1715
1716	folio_sub_large_mapcount(folio, diff: nr_pages, vma);
1717	do {
1718	last += atomic_add_negative(i: -`1`, v: &page->_mapcount);
1719	} while (page++, --nr_pages > `0`);
1720
1721	if (last &&
1722	atomic_sub_return_relaxed(i: last, v: mapped) < ENTIRELY_MAPPED)
1723	nr = last;
1724
1725	partially_mapped = nr && atomic_read(v: mapped);
1726	break;
1727	case RMAP_LEVEL_PMD:
1728	case RMAP_LEVEL_PUD:
1729	if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
1730	last = atomic_add_negative(i: -`1`, v: &folio->_entire_mapcount);
1731	if (level == RMAP_LEVEL_PMD && last)
1732	nr_pmdmapped = folio_large_nr_pages(folio);
1733	nr = folio_dec_return_large_mapcount(folio, vma);
1734	if (!nr) {
1735	/ Now completely unmapped. /
1736	nr = folio_large_nr_pages(folio);
1737	} else {
1738	partially_mapped = last &&
1739	nr < folio_large_nr_pages(folio);
1740	nr = `0`;
1741	}
1742	break;
1743	}
1744
1745	folio_dec_large_mapcount(folio, vma);
1746	last = atomic_add_negative(i: -`1`, v: &folio->_entire_mapcount);
1747	if (last) {
1748	nr = atomic_sub_return_relaxed(ENTIRELY_MAPPED, v: mapped);
1749	if (likely(nr < ENTIRELY_MAPPED)) {
1750	nr_pages = folio_large_nr_pages(folio);
1751	if (level == RMAP_LEVEL_PMD)
1752	nr_pmdmapped = nr_pages;
1753	nr = nr_pages - (nr & FOLIO_PAGES_MAPPED);
1754	/ Raced ahead of another remove and an add? /
1755	if (unlikely(nr < `0`))
1756	nr = `0`;
1757	} else {
1758	/ An add of ENTIRELY_MAPPED raced ahead /
1759	nr = `0`;
1760	}
1761	}
1762
1763	partially_mapped = nr && nr < nr_pmdmapped;
1764	break;
1765	}
1766
1767	/*
1768	* Queue anon large folio for deferred split if at least one page of
1769	* the folio is unmapped and at least one page is still mapped.
1770	*
1771	* Check partially_mapped first to ensure it is a large folio.
1772	*/
1773	if (partially_mapped && folio_test_anon(folio) &&
1774	!folio_test_partially_mapped(folio))
1775	deferred_split_folio(folio, partially_mapped: true);
1776
1777	__folio_mod_stat(folio, nr: -nr, nr_pmdmapped: -nr_pmdmapped);
1778
1779	/*
1780	* It would be tidy to reset folio_test_anon mapping when fully
1781	* unmapped, but that might overwrite a racing folio_add_anon_rmap_*()
1782	* which increments mapcount after us but sets mapping before us:
1783	* so leave the reset to free_pages_prepare, and remember that
1784	* it's only reliable while mapped.
1785	*/
1786
1787	munlock_vma_folio(folio, vma);
1788	}
1789
1790	/**
1791	* folio_remove_rmap_ptes - remove PTE mappings from a page range of a folio
1792	* @folio: The folio to remove the mappings from
1793	* @page: The first page to remove
1794	* @nr_pages: The number of pages that will be removed from the mapping
1795	* @vma: The vm area from which the mappings are removed
1796	*
1797	* The page range of the folio is defined by [page, page + nr_pages)
1798	*
1799	* The caller needs to hold the page table lock.
1800	*/
1801	void folio_remove_rmap_ptes(struct folio folio, struct* page *page,
1802	int nr_pages, struct vm_area_struct *vma)
1803	{
1804	__folio_remove_rmap(folio, page, nr_pages, vma, level: RMAP_LEVEL_PTE);
1805	}
1806
1807	/**
1808	* folio_remove_rmap_pmd - remove a PMD mapping from a page range of a folio
1809	* @folio: The folio to remove the mapping from
1810	* @page: The first page to remove
1811	* @vma: The vm area from which the mapping is removed
1812	*
1813	* The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
1814	*
1815	* The caller needs to hold the page table lock.
1816	*/
1817	void folio_remove_rmap_pmd(struct folio folio, struct* page *page,
1818	struct vm_area_struct *vma)
1819	{
1820	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1821	__folio_remove_rmap(folio, page, HPAGE_PMD_NR, vma, level: RMAP_LEVEL_PMD);
1822	#else
1823	WARN_ON_ONCE(true);
1824	#endif
1825	}
1826
1827	/**
1828	* folio_remove_rmap_pud - remove a PUD mapping from a page range of a folio
1829	* @folio: The folio to remove the mapping from
1830	* @page: The first page to remove
1831	* @vma: The vm area from which the mapping is removed
1832	*
1833	* The page range of the folio is defined by [page, page + HPAGE_PUD_NR)
1834	*
1835	* The caller needs to hold the page table lock.
1836	*/
1837	void folio_remove_rmap_pud(struct folio folio, struct* page *page,
1838	struct vm_area_struct *vma)
1839	{
1840	#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
1841	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
1842	__folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma, level: RMAP_LEVEL_PUD);
1843	#else
1844	WARN_ON_ONCE(true);
1845	#endif
1846	}
1847
1848	/ We support batch unmapping of PTEs for lazyfree large folios /
1849	static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
1850	struct folio folio, pte_t ptep)
1851	{
1852	const fpb_t fpb_flags = FPB_IGNORE_DIRTY \| FPB_IGNORE_SOFT_DIRTY;
1853	int max_nr = folio_nr_pages(folio);
1854	pte_t pte = ptep_get(ptep);
1855
1856	if (!folio_test_anon(folio) \|\| folio_test_swapbacked(folio))
1857	return false;
1858	if (pte_unused(pte))
1859	return false;
1860	if (pte_pfn(pte) != folio_pfn(folio))
1861	return false;
1862
1863	return folio_pte_batch(folio, addr, start_ptep: ptep, pte, max_nr, flags: fpb_flags, NULL,
1864	NULL, NULL) == max_nr;
1865	}
1866
1867	/*
1868	* @arg: enum ttu_flags will be passed to this argument
1869	*/
1870	static bool try_to_unmap_one(struct folio folio, struct* vm_area_struct *vma,
1871	unsigned long address, void *arg)
1872	{
1873	struct mm_struct *mm = vma->vm_mm;
1874	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, `0`);
1875	bool anon_exclusive, ret = true;
1876	pte_t pteval;
1877	struct page *subpage;
1878	struct mmu_notifier_range range;
1879	enum ttu_flags flags = (enum ttu_flags)(long)arg;
1880	unsigned long nr_pages = `1`, end_addr;
1881	unsigned long pfn;
1882	unsigned long hsz = `0`;
1883
1884	/*
1885	* When racing against e.g. zap_pte_range() on another cpu,
1886	* in between its ptep_get_and_clear_full() and folio_remove_rmap_*(),
1887	* try_to_unmap() may return before page_mapped() has become false,
1888	* if page table locking is skipped: use TTU_SYNC to wait for that.
1889	*/
1890	if (flags & TTU_SYNC)
1891	pvmw.flags = PVMW_SYNC;
1892
1893	/*
1894	* For THP, we have to assume the worse case ie pmd for invalidation.
1895	* For hugetlb, it could be much worse if we need to do pud
1896	* invalidation in the case of pmd sharing.
1897	*
1898	* Note that the folio can not be freed in this function as call of
1899	* try_to_unmap() must hold a reference on the folio.
1900	*/
1901	range.end = vma_address_end(pvmw: &pvmw);
1902	mmu_notifier_range_init(range: &range, event: MMU_NOTIFY_CLEAR, flags: `0`, mm: vma->vm_mm,
1903	start: address, end: range.end);
1904	if (folio_test_hugetlb(folio)) {
1905	/*
1906	* If sharing is possible, start and end will be adjusted
1907	* accordingly.
1908	*/
1909	adjust_range_if_pmd_sharing_possible(vma, start: &range.start,
1910	end: &range.end);
1911
1912	/ We need the huge page size for set_huge_pte_at() /
1913	hsz = huge_page_size(h: hstate_vma(vma));
1914	}
1915	mmu_notifier_invalidate_range_start(range: &range);
1916
1917	while (page_vma_mapped_walk(pvmw: &pvmw)) {
1918	/*
1919	* If the folio is in an mlock()d vma, we must not swap it out.
1920	*/
1921	if (!(flags & TTU_IGNORE_MLOCK) &&
1922	(vma->vm_flags & VM_LOCKED)) {
1923	/ Restore the mlock which got missed /
1924	if (!folio_test_large(folio))
1925	mlock_vma_folio(folio, vma);
1926	goto walk_abort;
1927	}
1928
1929	if (!pvmw.pte) {
1930	if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) {
1931	if (unmap_huge_pmd_locked(vma, addr: pvmw.address, pmdp: pvmw.pmd, folio))
1932	goto walk_done;
1933	/*
1934	* unmap_huge_pmd_locked has either already marked
1935	* the folio as swap-backed or decided to retain it
1936	* due to GUP or speculative references.
1937	*/
1938	goto walk_abort;
1939	}
1940
1941	if (flags & TTU_SPLIT_HUGE_PMD) {
1942	/*
1943	* We temporarily have to drop the PTL and
1944	* restart so we can process the PTE-mapped THP.
1945	*/
1946	split_huge_pmd_locked(vma, address: pvmw.address,
1947	pmd: pvmw.pmd, freeze: false);
1948	flags &= ~TTU_SPLIT_HUGE_PMD;
1949	page_vma_mapped_walk_restart(pvmw: &pvmw);
1950	continue;
1951	}
1952	}
1953
1954	/ Unexpected PMD-mapped THP? /
1955	VM_BUG_ON_FOLIO(!pvmw.pte, folio);
1956
1957	/*
1958	* Handle PFN swap PTEs, such as device-exclusive ones, that
1959	* actually map pages.
1960	*/
1961	pteval = ptep_get(ptep: pvmw.pte);
1962	if (likely(pte_present(pteval))) {
1963	pfn = pte_pfn(pte: pteval);
1964	} else {
1965	pfn = swp_offset_pfn(entry: pte_to_swp_entry(pte: pteval));
1966	VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
1967	}
1968
1969	subpage = folio_page(folio, pfn - folio_pfn(folio));
1970	address = pvmw.address;
1971	anon_exclusive = folio_test_anon(folio) &&
1972	PageAnonExclusive(page: subpage);
1973
1974	if (folio_test_hugetlb(folio)) {
1975	bool anon = folio_test_anon(folio);
1976
1977	/*
1978	* The try_to_unmap() is only passed a hugetlb page
1979	* in the case where the hugetlb page is poisoned.
1980	*/
1981	VM_BUG_ON_PAGE(!PageHWPoison(subpage), subpage);
1982	/*
1983	* huge_pmd_unshare may unmap an entire PMD page.
1984	* There is no way of knowing exactly which PMDs may
1985	* be cached for this mm, so we must flush them all.
1986	* start/end were already adjusted above to cover this
1987	* range.
1988	*/
1989	flush_cache_range(vma, start: range.start, end: range.end);
1990
1991	/*
1992	* To call huge_pmd_unshare, i_mmap_rwsem must be
1993	* held in write mode. Caller needs to explicitly
1994	* do this outside rmap routines.
1995	*
1996	* We also must hold hugetlb vma_lock in write mode.
1997	* Lock order dictates acquiring vma_lock BEFORE
1998	* i_mmap_rwsem. We can only try lock here and fail
1999	* if unsuccessful.
2000	*/
2001	if (!anon) {
2002	VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
2003	if (!hugetlb_vma_trylock_write(vma))
2004	goto walk_abort;
2005	if (huge_pmd_unshare(mm, vma, addr: address, ptep: pvmw.pte)) {
2006	hugetlb_vma_unlock_write(vma);
2007	flush_tlb_range(vma,
2008	range.start, range.end);
2009	/*
2010	* The ref count of the PMD page was
2011	* dropped which is part of the way map
2012	* counting is done for shared PMDs.
2013	* Return 'true' here. When there is
2014	* no other sharing, huge_pmd_unshare
2015	* returns false and we will unmap the
2016	* actual page and drop map count
2017	* to zero.
2018	*/
2019	goto walk_done;
2020	}
2021	hugetlb_vma_unlock_write(vma);
2022	}
2023	pteval = huge_ptep_clear_flush(vma, addr: address, ptep: pvmw.pte);
2024	if (pte_dirty(pte: pteval))
2025	folio_mark_dirty(folio);
2026	} else if (likely(pte_present(pteval))) {
2027	if (folio_test_large(folio) && !(flags & TTU_HWPOISON) &&
2028	can_batch_unmap_folio_ptes(addr: address, folio, ptep: pvmw.pte))
2029	nr_pages = folio_nr_pages(folio);
2030	end_addr = address + nr_pages * PAGE_SIZE;
2031	flush_cache_range(vma, start: address, end: end_addr);
2032
2033	/ Nuke the page table entry. /
2034	pteval = get_and_clear_full_ptes(mm, addr: address, ptep: pvmw.pte, nr: nr_pages, full: `0`);
2035	/*
2036	* We clear the PTE but do not flush so potentially
2037	* a remote CPU could still be writing to the folio.
2038	* If the entry was previously clean then the
2039	* architecture must guarantee that a clear->dirty
2040	* transition on a cached TLB entry is written through
2041	* and traps if the PTE is unmapped.
2042	*/
2043	if (should_defer_flush(mm, flags))
2044	set_tlb_ubc_flush_pending(mm, pteval, start: address, end: end_addr);
2045	else
2046	flush_tlb_range(vma, address, end_addr);
2047	if (pte_dirty(pte: pteval))
2048	folio_mark_dirty(folio);
2049	} else {
2050	pte_clear(mm, addr: address, ptep: pvmw.pte);
2051	}
2052
2053	/*
2054	* Now the pte is cleared. If this pte was uffd-wp armed,
2055	* we may want to replace a none pte with a marker pte if
2056	* it's file-backed, so we don't lose the tracking info.
2057	*/
2058	pte_install_uffd_wp_if_needed(vma, addr: address, pte: pvmw.pte, pteval);
2059
2060	/ Update high watermark before we lower rss /
2061	update_hiwater_rss(mm);
2062
2063	if (PageHWPoison(page: subpage) && (flags & TTU_HWPOISON)) {
2064	pteval = swp_entry_to_pte(entry: make_hwpoison_entry(page: subpage));
2065	if (folio_test_hugetlb(folio)) {
2066	hugetlb_count_sub(l: folio_nr_pages(folio), mm);
2067	set_huge_pte_at(mm, addr: address, ptep: pvmw.pte, pte: pteval,
2068	sz: hsz);
2069	} else {
2070	dec_mm_counter(mm, member: mm_counter(folio));
2071	set_pte_at(mm, address, pvmw.pte, pteval);
2072	}
2073	} else if (likely(pte_present(pteval)) && pte_unused(pte: pteval) &&
2074	!userfaultfd_armed(vma)) {
2075	/*
2076	* The guest indicated that the page content is of no
2077	* interest anymore. Simply discard the pte, vmscan
2078	* will take care of the rest.
2079	* A future reference will then fault in a new zero
2080	* page. When userfaultfd is active, we must not drop
2081	* this page though, as its main user (postcopy
2082	* migration) will not expect userfaults on already
2083	* copied pages.
2084	*/
2085	dec_mm_counter(mm, member: mm_counter(folio));
2086	} else if (folio_test_anon(folio)) {
2087	swp_entry_t entry = page_swap_entry(page: subpage);
2088	pte_t swp_pte;
2089	/*
2090	* Store the swap location in the pte.
2091	* See handle_pte_fault() ...
2092	*/
2093	if (unlikely(folio_test_swapbacked(folio) !=
2094	folio_test_swapcache(folio))) {
2095	WARN_ON_ONCE(`1`);
2096	goto walk_abort;
2097	}
2098
2099	/ MADV_FREE page check /
2100	if (!folio_test_swapbacked(folio)) {
2101	int ref_count, map_count;
2102
2103	/*
2104	* Synchronize with gup_pte_range():
2105	* - clear PTE; barrier; read refcount
2106	* - inc refcount; barrier; read PTE
2107	*/
2108	smp_mb();
2109
2110	ref_count = folio_ref_count(folio);
2111	map_count = folio_mapcount(folio);
2112
2113	/*
2114	* Order reads for page refcount and dirty flag
2115	* (see comments in __remove_mapping()).
2116	*/
2117	smp_rmb();
2118
2119	if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
2120	/*
2121	* redirtied either using the page table or a previously
2122	* obtained GUP reference.
2123	*/
2124	set_ptes(mm, addr: address, ptep: pvmw.pte, pte: pteval, nr: nr_pages);
2125	folio_set_swapbacked(folio);
2126	goto walk_abort;
2127	} else if (ref_count != `1` + map_count) {
2128	/*
2129	* Additional reference. Could be a GUP reference or any
2130	* speculative reference. GUP users must mark the folio
2131	* dirty if there was a modification. This folio cannot be
2132	* reclaimed right now either way, so act just like nothing
2133	* happened.
2134	* We'll come back here later and detect if the folio was
2135	* dirtied when the additional reference is gone.
2136	*/
2137	set_ptes(mm, addr: address, ptep: pvmw.pte, pte: pteval, nr: nr_pages);
2138	goto walk_abort;
2139	}
2140	add_mm_counter(mm, member: MM_ANONPAGES, value: -nr_pages);
2141	goto discard;
2142	}
2143
2144	if (swap_duplicate(entry) < `0`) {
2145	set_pte_at(mm, address, pvmw.pte, pteval);
2146	goto walk_abort;
2147	}
2148
2149	/*
2150	* arch_unmap_one() is expected to be a NOP on
2151	* architectures where we could have PFN swap PTEs,
2152	* so we'll not check/care.
2153	*/
2154	if (arch_unmap_one(mm, vma, addr: address, orig_pte: pteval) < `0`) {
2155	swap_free(entry);
2156	set_pte_at(mm, address, pvmw.pte, pteval);
2157	goto walk_abort;
2158	}
2159
2160	/ See folio_try_share_anon_rmap(): clear PTE first. /
2161	if (anon_exclusive &&
2162	folio_try_share_anon_rmap_pte(folio, page: subpage)) {
2163	swap_free(entry);
2164	set_pte_at(mm, address, pvmw.pte, pteval);
2165	goto walk_abort;
2166	}
2167	if (list_empty(head: &mm->mmlist)) {
2168	spin_lock(lock: &mmlist_lock);
2169	if (list_empty(head: &mm->mmlist))
2170	list_add(new: &mm->mmlist, head: &init_mm.mmlist);
2171	spin_unlock(lock: &mmlist_lock);
2172	}
2173	dec_mm_counter(mm, member: MM_ANONPAGES);
2174	inc_mm_counter(mm, member: MM_SWAPENTS);
2175	swp_pte = swp_entry_to_pte(entry);
2176	if (anon_exclusive)
2177	swp_pte = pte_swp_mkexclusive(pte: swp_pte);
2178	if (likely(pte_present(pteval))) {
2179	if (pte_soft_dirty(pte: pteval))
2180	swp_pte = pte_swp_mksoft_dirty(pte: swp_pte);
2181	if (pte_uffd_wp(pte: pteval))
2182	swp_pte = pte_swp_mkuffd_wp(pte: swp_pte);
2183	} else {
2184	if (pte_swp_soft_dirty(pte: pteval))
2185	swp_pte = pte_swp_mksoft_dirty(pte: swp_pte);
2186	if (pte_swp_uffd_wp(pte: pteval))
2187	swp_pte = pte_swp_mkuffd_wp(pte: swp_pte);
2188	}
2189	set_pte_at(mm, address, pvmw.pte, swp_pte);
2190	} else {
2191	/*
2192	* This is a locked file-backed folio,
2193	* so it cannot be removed from the page
2194	* cache and replaced by a new folio before
2195	* mmu_notifier_invalidate_range_end, so no
2196	* concurrent thread might update its page table
2197	* to point at a new folio while a device is
2198	* still using this folio.
2199	*
2200	* See Documentation/mm/mmu_notifier.rst
2201	*/
2202	dec_mm_counter(mm, member: mm_counter_file(folio));
2203	}
2204	discard:
2205	if (unlikely(folio_test_hugetlb(folio))) {
2206	hugetlb_remove_rmap(folio);
2207	} else {
2208	folio_remove_rmap_ptes(folio, page: subpage, nr_pages, vma);
2209	folio_ref_sub(folio, nr: nr_pages - `1`);
2210	}
2211	if (vma->vm_flags & VM_LOCKED)
2212	mlock_drain_local();
2213	folio_put(folio);
2214	/ We have already batched the entire folio /
2215	if (nr_pages > `1`)
2216	goto walk_done;
2217	continue;
2218	walk_abort:
2219	ret = false;
2220	walk_done:
2221	page_vma_mapped_walk_done(pvmw: &pvmw);
2222	break;
2223	}
2224
2225	mmu_notifier_invalidate_range_end(range: &range);
2226
2227	return ret;
2228	}
2229
2230	static bool invalid_migration_vma(struct vm_area_struct vma, void* *arg)
2231	{
2232	return vma_is_temporary_stack(vma);
2233	}
2234
/*
 * rmap walk ->done() callback: returns 1 when @folio is no longer mapped
 * (so the walk may stop), 0 while folio_mapped() still reports a mapping.
 */
static int folio_not_mapped(struct folio *folio)
{
	if (folio_mapped(folio))
		return 0;

	return 1;
}
2239
2240	/**
2241	* try_to_unmap - Try to remove all page table mappings to a folio.
2242	* @folio: The folio to unmap.
2243	* @flags: action and flags
2244	*
2245	* Tries to remove all the page table entries which are mapping this
2246	* folio. It is the caller's responsibility to check if the folio is
2247	* still mapped if needed (use TTU_SYNC to prevent accounting races).
2248	*
2249	* Context: Caller must hold the folio lock.
2250	*/
2251	void try_to_unmap(struct folio folio, enum* ttu_flags flags)
2252	{
2253	struct rmap_walk_control rwc = {
2254	.rmap_one = try_to_unmap_one,
2255	.arg = (void *)flags,
2256	.done = folio_not_mapped,
2257	.anon_lock = folio_lock_anon_vma_read,
2258	};
2259
2260	if (flags & TTU_RMAP_LOCKED)
2261	rmap_walk_locked(folio, rwc: &rwc);
2262	else
2263	rmap_walk(folio, rwc: &rwc);
2264	}
2265
2266	/*
2267	* @arg: enum ttu_flags will be passed to this argument.
2268	*
2269	* If TTU_SPLIT_HUGE_PMD is specified any PMD mappings will be split into PTEs
2270	* containing migration entries.
2271	*/
2272	static bool try_to_migrate_one(struct folio folio, struct* vm_area_struct *vma,
2273	unsigned long address, void *arg)
2274	{
2275	struct mm_struct *mm = vma->vm_mm;
2276	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, `0`);
2277	bool anon_exclusive, writable, ret = true;
2278	pte_t pteval;
2279	struct page *subpage;
2280	struct mmu_notifier_range range;
2281	enum ttu_flags flags = (enum ttu_flags)(long)arg;
2282	unsigned long pfn;
2283	unsigned long hsz = `0`;
2284
2285	/*
2286	* When racing against e.g. zap_pte_range() on another cpu,
2287	* in between its ptep_get_and_clear_full() and folio_remove_rmap_*(),
2288	* try_to_migrate() may return before page_mapped() has become false,
2289	* if page table locking is skipped: use TTU_SYNC to wait for that.
2290	*/
2291	if (flags & TTU_SYNC)
2292	pvmw.flags = PVMW_SYNC;
2293
2294	/*
2295	* For THP, we have to assume the worse case ie pmd for invalidation.
2296	* For hugetlb, it could be much worse if we need to do pud
2297	* invalidation in the case of pmd sharing.
2298	*
2299	* Note that the page can not be free in this function as call of
2300	* try_to_unmap() must hold a reference on the page.
2301	*/
2302	range.end = vma_address_end(pvmw: &pvmw);
2303	mmu_notifier_range_init(range: &range, event: MMU_NOTIFY_CLEAR, flags: `0`, mm: vma->vm_mm,
2304	start: address, end: range.end);
2305	if (folio_test_hugetlb(folio)) {
2306	/*
2307	* If sharing is possible, start and end will be adjusted
2308	* accordingly.
2309	*/
2310	adjust_range_if_pmd_sharing_possible(vma, start: &range.start,
2311	end: &range.end);
2312
2313	/ We need the huge page size for set_huge_pte_at() /
2314	hsz = huge_page_size(h: hstate_vma(vma));
2315	}
2316	mmu_notifier_invalidate_range_start(range: &range);
2317
2318	while (page_vma_mapped_walk(pvmw: &pvmw)) {
2319	/ PMD-mapped THP migration entry /
2320	if (!pvmw.pte) {
2321	if (flags & TTU_SPLIT_HUGE_PMD) {
2322	split_huge_pmd_locked(vma, address: pvmw.address,
2323	pmd: pvmw.pmd, freeze: true);
2324	ret = false;
2325	page_vma_mapped_walk_done(pvmw: &pvmw);
2326	break;
2327	}
2328	#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
2329	subpage = folio_page(folio,
2330	pmd_pfn(*pvmw.pmd) - folio_pfn(folio));
2331	VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) \|\|
2332	!folio_test_pmd_mappable(folio), folio);
2333
2334	if (set_pmd_migration_entry(pvmw: &pvmw, page: subpage)) {
2335	ret = false;
2336	page_vma_mapped_walk_done(pvmw: &pvmw);
2337	break;
2338	}
2339	continue;
2340	#endif
2341	}
2342
2343	/ Unexpected PMD-mapped THP? /
2344	VM_BUG_ON_FOLIO(!pvmw.pte, folio);
2345
2346	/*
2347	* Handle PFN swap PTEs, such as device-exclusive ones, that
2348	* actually map pages.
2349	*/
2350	pteval = ptep_get(ptep: pvmw.pte);
2351	if (likely(pte_present(pteval))) {
2352	pfn = pte_pfn(pte: pteval);
2353	} else {
2354	pfn = swp_offset_pfn(entry: pte_to_swp_entry(pte: pteval));
2355	VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
2356	}
2357
2358	subpage = folio_page(folio, pfn - folio_pfn(folio));
2359	address = pvmw.address;
2360	anon_exclusive = folio_test_anon(folio) &&
2361	PageAnonExclusive(page: subpage);
2362
2363	if (folio_test_hugetlb(folio)) {
2364	bool anon = folio_test_anon(folio);
2365
2366	/*
2367	* huge_pmd_unshare may unmap an entire PMD page.
2368	* There is no way of knowing exactly which PMDs may
2369	* be cached for this mm, so we must flush them all.
2370	* start/end were already adjusted above to cover this
2371	* range.
2372	*/
2373	flush_cache_range(vma, start: range.start, end: range.end);
2374
2375	/*
2376	* To call huge_pmd_unshare, i_mmap_rwsem must be
2377	* held in write mode. Caller needs to explicitly
2378	* do this outside rmap routines.
2379	*
2380	* We also must hold hugetlb vma_lock in write mode.
2381	* Lock order dictates acquiring vma_lock BEFORE
2382	* i_mmap_rwsem. We can only try lock here and
2383	* fail if unsuccessful.
2384	*/
2385	if (!anon) {
2386	VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
2387	if (!hugetlb_vma_trylock_write(vma)) {
2388	page_vma_mapped_walk_done(pvmw: &pvmw);
2389	ret = false;
2390	break;
2391	}
2392	if (huge_pmd_unshare(mm, vma, addr: address, ptep: pvmw.pte)) {
2393	hugetlb_vma_unlock_write(vma);
2394	flush_tlb_range(vma,
2395	range.start, range.end);
2396
2397	/*
2398	* The ref count of the PMD page was
2399	* dropped which is part of the way map
2400	* counting is done for shared PMDs.
2401	* Return 'true' here. When there is
2402	* no other sharing, huge_pmd_unshare
2403	* returns false and we will unmap the
2404	* actual page and drop map count
2405	* to zero.
2406	*/
2407	page_vma_mapped_walk_done(pvmw: &pvmw);
2408	break;
2409	}
2410	hugetlb_vma_unlock_write(vma);
2411	}
2412	/ Nuke the hugetlb page table entry /
2413	pteval = huge_ptep_clear_flush(vma, addr: address, ptep: pvmw.pte);
2414	if (pte_dirty(pte: pteval))
2415	folio_mark_dirty(folio);
2416	writable = pte_write(pte: pteval);
2417	} else if (likely(pte_present(pteval))) {
2418	flush_cache_page(vma, vmaddr: address, pfn);
2419	/ Nuke the page table entry. /
2420	if (should_defer_flush(mm, flags)) {
2421	/*
2422	* We clear the PTE but do not flush so potentially
2423	* a remote CPU could still be writing to the folio.
2424	* If the entry was previously clean then the
2425	* architecture must guarantee that a clear->dirty
2426	* transition on a cached TLB entry is written through
2427	* and traps if the PTE is unmapped.
2428	*/
2429	pteval = ptep_get_and_clear(mm, addr: address, ptep: pvmw.pte);
2430
2431	set_tlb_ubc_flush_pending(mm, pteval, start: address, end: address + PAGE_SIZE);
2432	} else {
2433	pteval = ptep_clear_flush(vma, address, ptep: pvmw.pte);
2434	}
2435	if (pte_dirty(pte: pteval))
2436	folio_mark_dirty(folio);
2437	writable = pte_write(pte: pteval);
2438	} else {
2439	pte_clear(mm, addr: address, ptep: pvmw.pte);
2440	writable = is_writable_device_private_entry(entry: pte_to_swp_entry(pte: pteval));
2441	}
2442
2443	VM_WARN_ON_FOLIO(writable && folio_test_anon(folio) &&
2444	!anon_exclusive, folio);
2445
2446	/ Update high watermark before we lower rss /
2447	update_hiwater_rss(mm);
2448
2449	if (PageHWPoison(page: subpage)) {
2450	VM_WARN_ON_FOLIO(folio_is_device_private(folio), folio);
2451
2452	pteval = swp_entry_to_pte(entry: make_hwpoison_entry(page: subpage));
2453	if (folio_test_hugetlb(folio)) {
2454	hugetlb_count_sub(l: folio_nr_pages(folio), mm);
2455	set_huge_pte_at(mm, addr: address, ptep: pvmw.pte, pte: pteval,
2456	sz: hsz);
2457	} else {
2458	dec_mm_counter(mm, member: mm_counter(folio));
2459	set_pte_at(mm, address, pvmw.pte, pteval);
2460	}
2461	} else if (likely(pte_present(pteval)) && pte_unused(pte: pteval) &&
2462	!userfaultfd_armed(vma)) {
2463	/*
2464	* The guest indicated that the page content is of no
2465	* interest anymore. Simply discard the pte, vmscan
2466	* will take care of the rest.
2467	* A future reference will then fault in a new zero
2468	* page. When userfaultfd is active, we must not drop
2469	* this page though, as its main user (postcopy
2470	* migration) will not expect userfaults on already
2471	* copied pages.
2472	*/
2473	dec_mm_counter(mm, member: mm_counter(folio));
2474	} else {
2475	swp_entry_t entry;
2476	pte_t swp_pte;
2477
2478	/*
2479	* arch_unmap_one() is expected to be a NOP on
2480	* architectures where we could have PFN swap PTEs,
2481	* so we'll not check/care.
2482	*/
2483	if (arch_unmap_one(mm, vma, addr: address, orig_pte: pteval) < `0`) {
2484	if (folio_test_hugetlb(folio))
2485	set_huge_pte_at(mm, addr: address, ptep: pvmw.pte,
2486	pte: pteval, sz: hsz);
2487	else
2488	set_pte_at(mm, address, pvmw.pte, pteval);
2489	ret = false;
2490	page_vma_mapped_walk_done(pvmw: &pvmw);
2491	break;
2492	}
2493
2494	/ See folio_try_share_anon_rmap_pte(): clear PTE first. /
2495	if (folio_test_hugetlb(folio)) {
2496	if (anon_exclusive &&
2497	hugetlb_try_share_anon_rmap(folio)) {
2498	set_huge_pte_at(mm, addr: address, ptep: pvmw.pte,
2499	pte: pteval, sz: hsz);
2500	ret = false;
2501	page_vma_mapped_walk_done(pvmw: &pvmw);
2502	break;
2503	}
2504	} else if (anon_exclusive &&
2505	folio_try_share_anon_rmap_pte(folio, page: subpage)) {
2506	set_pte_at(mm, address, pvmw.pte, pteval);
2507	ret = false;
2508	page_vma_mapped_walk_done(pvmw: &pvmw);
2509	break;
2510	}
2511
2512	/*
2513	* Store the pfn of the page in a special migration
2514	* pte. do_swap_page() will wait until the migration
2515	* pte is removed and then restart fault handling.
2516	*/
2517	if (writable)
2518	entry = make_writable_migration_entry(
2519	page_to_pfn(subpage));
2520	else if (anon_exclusive)
2521	entry = make_readable_exclusive_migration_entry(
2522	page_to_pfn(subpage));
2523	else
2524	entry = make_readable_migration_entry(
2525	page_to_pfn(subpage));
2526	if (likely(pte_present(pteval))) {
2527	if (pte_young(pte: pteval))
2528	entry = make_migration_entry_young(entry);
2529	if (pte_dirty(pte: pteval))
2530	entry = make_migration_entry_dirty(entry);
2531	swp_pte = swp_entry_to_pte(entry);
2532	if (pte_soft_dirty(pte: pteval))
2533	swp_pte = pte_swp_mksoft_dirty(pte: swp_pte);
2534	if (pte_uffd_wp(pte: pteval))
2535	swp_pte = pte_swp_mkuffd_wp(pte: swp_pte);
2536	} else {
2537	swp_pte = swp_entry_to_pte(entry);
2538	if (pte_swp_soft_dirty(pte: pteval))
2539	swp_pte = pte_swp_mksoft_dirty(pte: swp_pte);
2540	if (pte_swp_uffd_wp(pte: pteval))
2541	swp_pte = pte_swp_mkuffd_wp(pte: swp_pte);
2542	}
2543	if (folio_test_hugetlb(folio))
2544	set_huge_pte_at(mm, addr: address, ptep: pvmw.pte, pte: swp_pte,
2545	sz: hsz);
2546	else
2547	set_pte_at(mm, address, pvmw.pte, swp_pte);
2548	trace_set_migration_pte(addr: address, pte: pte_val(pte: swp_pte),
2549	order: folio_order(folio));
2550	/*
2551	* No need to invalidate here it will synchronize on
2552	* against the special swap migration pte.
2553	*/
2554	}
2555
2556	if (unlikely(folio_test_hugetlb(folio)))
2557	hugetlb_remove_rmap(folio);
2558	else
2559	folio_remove_rmap_pte(folio, subpage, vma);
2560	if (vma->vm_flags & VM_LOCKED)
2561	mlock_drain_local();
2562	folio_put(folio);
2563	}
2564
2565	mmu_notifier_invalidate_range_end(range: &range);
2566
2567	return ret;
2568	}
2569
2570	/**
2571	* try_to_migrate - try to replace all page table mappings with swap entries
2572	* @folio: the folio to replace page table entries for
2573	* @flags: action and flags
2574	*
2575	* Tries to remove all the page table entries which are mapping this folio and
2576	* replace them with special swap entries. Caller must hold the folio lock.
2577	*/
2578	void try_to_migrate(struct folio folio, enum* ttu_flags flags)
2579	{
2580	struct rmap_walk_control rwc = {
2581	.rmap_one = try_to_migrate_one,
2582	.arg = (void *)flags,
2583	.done = folio_not_mapped,
2584	.anon_lock = folio_lock_anon_vma_read,
2585	};
2586
2587	/*
2588	* Migration always ignores mlock and only supports TTU_RMAP_LOCKED and
2589	* TTU_SPLIT_HUGE_PMD, TTU_SYNC, and TTU_BATCH_FLUSH flags.
2590	*/
2591	if (WARN_ON_ONCE(flags & ~(TTU_RMAP_LOCKED \| TTU_SPLIT_HUGE_PMD \|
2592	TTU_SYNC \| TTU_BATCH_FLUSH)))
2593	return;
2594
2595	if (folio_is_zone_device(folio) &&
2596	(!folio_is_device_private(folio) && !folio_is_device_coherent(folio)))
2597	return;
2598
2599	/*
2600	* During exec, a temporary VMA is setup and later moved.
2601	* The VMA is moved under the anon_vma lock but not the
2602	* page tables leading to a race where migration cannot
2603	* find the migration ptes. Rather than increasing the
2604	* locking requirements of exec(), migration skips
2605	* temporary VMAs until after exec() completes.
2606	*/
2607	if (!folio_test_ksm(folio) && folio_test_anon(folio))
2608	rwc.invalid_vma = invalid_migration_vma;
2609
2610	if (flags & TTU_RMAP_LOCKED)
2611	rmap_walk_locked(folio, rwc: &rwc);
2612	else
2613	rmap_walk(folio, rwc: &rwc);
2614	}
2615
#ifdef CONFIG_DEVICE_PRIVATE
/**
 * make_device_exclusive() - Mark a page for exclusive use by a device
 * @mm: mm_struct of associated target process
 * @addr: the virtual address to mark for exclusive device access
 * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering
 * @foliop: folio pointer will be stored here on success.
 *
 * This function looks up the page mapped at the given address, grabs a
 * folio reference, locks the folio and replaces the PTE with special
 * device-exclusive PFN swap entry, preventing access through the process
 * page tables. The function will return with the folio locked and referenced.
 *
 * On fault, the device-exclusive entries are replaced with the original PTE
 * under folio lock, after calling MMU notifiers.
 *
 * Only anonymous non-hugetlb folios are supported and the VMA must have
 * write permissions such that we can fault in the anonymous page writable
 * in order to mark it exclusive. The caller must hold the mmap_lock in read
 * mode.
 *
 * A driver using this to program access from a device must use a mmu notifier
 * critical section to hold a device specific lock during programming. Once
 * programming is complete it should drop the folio lock and reference after
 * which point CPU access to the page will revoke the exclusive access.
 *
 * Notes:
 *   #. This function always operates on individual PTEs mapping individual
 *      pages. PMD-sized THPs are first remapped to be mapped by PTEs before
 *      the conversion happens on a single PTE corresponding to @addr.
 *   #. While concurrent access through the process page tables is prevented,
 *      concurrent access through other page references (e.g., earlier GUP
 *      invocation) is not handled and not supported.
 *   #. device-exclusive entries are considered "clean" and "old" by core-mm.
 *      Device drivers must update the folio state when informed by MMU
 *      notifiers.
 *
 * Returns: pointer to mapped page on success, otherwise a negative error.
 */
struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
		void *owner, struct folio **foliop)
{
	struct mmu_notifier_range range;
	struct folio *folio, *fw_folio;
	struct vm_area_struct *vma;
	struct folio_walk fw;
	struct page *page;
	swp_entry_t entry;
	pte_t swp_pte;
	int ret;

	mmap_assert_locked(mm);
	addr = PAGE_ALIGN_DOWN(addr);

	/*
	 * Fault in the page writable and try to lock it; note that if the
	 * address would already be marked for exclusive use by a device,
	 * the GUP call would undo that first by triggering a fault.
	 *
	 * If any other device would already map this page exclusively, the
	 * fault will trigger a conversion to an ordinary
	 * (non-device-exclusive) PTE and issue a MMU_NOTIFY_EXCLUSIVE.
	 */
retry:
	page = get_user_page_vma_remote(mm, addr,
					FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
					&vma);
	if (IS_ERR(page))
		return page;
	folio = page_folio(page);

	if (!folio_test_anon(folio) || folio_test_hugetlb(folio)) {
		folio_put(folio);
		return ERR_PTR(-EOPNOTSUPP);
	}

	ret = folio_lock_killable(folio);
	if (ret) {
		folio_put(folio);
		return ERR_PTR(ret);
	}

	/*
	 * Inform secondary MMUs that we are going to convert this PTE to
	 * device-exclusive, such that they unmap it now. Note that the
	 * caller must filter this event out to prevent livelocks.
	 */
	mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
				      mm, addr, addr + PAGE_SIZE, owner);
	mmu_notifier_invalidate_range_start(&range);

	/*
	 * Let's do a second walk and make sure we still find the same page
	 * mapped writable. Note that any page of an anonymous folio can
	 * only be mapped writable using exactly one PTE ("exclusive"), so
	 * there cannot be other mappings.
	 */
	fw_folio = folio_walk_start(&fw, vma, addr, 0);
	if (fw_folio != folio || fw.page != page ||
	    fw.level != FW_LEVEL_PTE || !pte_write(fw.pte)) {
		/* Something changed under us: undo everything and start over. */
		if (fw_folio)
			folio_walk_end(&fw, vma);
		mmu_notifier_invalidate_range_end(&range);
		folio_unlock(folio);
		folio_put(folio);
		goto retry;
	}

	/* Nuke the page table entry so we get the uptodate dirty bit. */
	flush_cache_page(vma, addr, page_to_pfn(page));
	fw.pte = ptep_clear_flush(vma, addr, fw.ptep);

	/* Set the dirty flag on the folio now the PTE is gone. */
	if (pte_dirty(fw.pte))
		folio_mark_dirty(folio);

	/*
	 * Store the pfn of the page in a special device-exclusive PFN swap PTE.
	 * do_swap_page() will trigger the conversion back while holding the
	 * folio lock.
	 */
	entry = make_device_exclusive_entry(page_to_pfn(page));
	swp_pte = swp_entry_to_pte(entry);
	if (pte_soft_dirty(fw.pte))
		swp_pte = pte_swp_mksoft_dirty(swp_pte);
	/* The pte is writable, uffd-wp does not apply. */
	set_pte_at(mm, addr, fw.ptep, swp_pte);

	folio_walk_end(&fw, vma);
	mmu_notifier_invalidate_range_end(&range);
	*foliop = folio;
	return page;
}
EXPORT_SYMBOL_GPL(make_device_exclusive);
#endif
2751
2752	void __put_anon_vma(struct anon_vma *anon_vma)
2753	{
2754	struct anon_vma *root = anon_vma->root;
2755
2756	anon_vma_free(anon_vma);
2757	if (root != anon_vma && atomic_dec_and_test(v: &root->refcount))
2758	anon_vma_free(anon_vma: root);
2759	}
2760
2761	static struct anon_vma rmap_walk_anon_lock(const* struct folio *folio,
2762	struct rmap_walk_control *rwc)
2763	{
2764	struct anon_vma *anon_vma;
2765
2766	if (rwc->anon_lock)
2767	return rwc->anon_lock(folio, rwc);
2768
2769	/*
2770	* Note: remove_migration_ptes() cannot use folio_lock_anon_vma_read()
2771	* because that depends on page_mapped(); but not all its usages
2772	* are holding mmap_lock. Users without mmap_lock are required to
2773	* take a reference count to prevent the anon_vma disappearing
2774	*/
2775	anon_vma = folio_anon_vma(folio);
2776	if (!anon_vma)
2777	return NULL;
2778
2779	if (anon_vma_trylock_read(anon_vma))
2780	goto out;
2781
2782	if (rwc->try_lock) {
2783	anon_vma = NULL;
2784	rwc->contended = true;
2785	goto out;
2786	}
2787
2788	anon_vma_lock_read(anon_vma);
2789	out:
2790	return anon_vma;
2791	}
2792
2793	/*
2794	* rmap_walk_anon - do something to anonymous page using the object-based
2795	* rmap method
2796	* @folio: the folio to be handled
2797	* @rwc: control variable according to each walk type
2798	* @locked: caller holds relevant rmap lock
2799	*
2800	* Find all the mappings of a folio using the mapping pointer and the vma
2801	* chains contained in the anon_vma struct it points to.
2802	*/
2803	static void rmap_walk_anon(struct folio *folio,
2804	struct rmap_walk_control *rwc, bool locked)
2805	{
2806	struct anon_vma *anon_vma;
2807	pgoff_t pgoff_start, pgoff_end;
2808	struct anon_vma_chain *avc;
2809
2810	if (locked) {
2811	anon_vma = folio_anon_vma(folio);
2812	/ anon_vma disappear under us? /
2813	VM_BUG_ON_FOLIO(!anon_vma, folio);
2814	} else {
2815	anon_vma = rmap_walk_anon_lock(folio, rwc);
2816	}
2817	if (!anon_vma)
2818	return;
2819
2820	pgoff_start = folio_pgoff(folio);
2821	pgoff_end = pgoff_start + folio_nr_pages(folio) - `1`;
2822	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
2823	pgoff_start, pgoff_end) {
2824	struct vm_area_struct *vma = avc->vma;
2825	unsigned long address = vma_address(vma, pgoff: pgoff_start,
2826	nr_pages: folio_nr_pages(folio));
2827
2828	VM_BUG_ON_VMA(address == -EFAULT, vma);
2829	cond_resched();
2830
2831	if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
2832	continue;
2833
2834	if (!rwc->rmap_one(folio, vma, address, rwc->arg))
2835	break;
2836	if (rwc->done && rwc->done(folio))
2837	break;
2838	}
2839
2840	if (!locked)
2841	anon_vma_unlock_read(anon_vma);
2842	}
2843
2844	/**
2845	* __rmap_walk_file() - Traverse the reverse mapping for a file-backed mapping
2846	* of a page mapped within a specified page cache object at a specified offset.
2847	*
2848	* @folio: Either the folio whose mappings to traverse, or if NULL,
2849	* the callbacks specified in @rwc will be configured such
2850	* as to be able to look up mappings correctly.
2851	* @mapping: The page cache object whose mapping VMAs we intend to
2852	* traverse. If @folio is non-NULL, this should be equal to
2853	* folio_mapping(folio).
2854	* @pgoff_start: The offset within @mapping of the page which we are
2855	* looking up. If @folio is non-NULL, this should be equal
2856	* to folio_pgoff(folio).
2857	* @nr_pages: The number of pages mapped by the mapping. If @folio is
2858	* non-NULL, this should be equal to folio_nr_pages(folio).
2859	* @rwc: The reverse mapping walk control object describing how
2860	* the traversal should proceed.
2861	* @locked: Is the @mapping already locked? If not, we acquire the
2862	* lock.
2863	*/
2864	static void __rmap_walk_file(struct folio folio, struct* address_space *mapping,
2865	pgoff_t pgoff_start, unsigned long nr_pages,
2866	struct rmap_walk_control *rwc, bool locked)
2867	{
2868	pgoff_t pgoff_end = pgoff_start + nr_pages - `1`;
2869	struct vm_area_struct *vma;
2870
2871	VM_WARN_ON_FOLIO(folio && mapping != folio_mapping(folio), folio);
2872	VM_WARN_ON_FOLIO(folio && pgoff_start != folio_pgoff(folio), folio);
2873	VM_WARN_ON_FOLIO(folio && nr_pages != folio_nr_pages(folio), folio);
2874
2875	if (!locked) {
2876	if (i_mmap_trylock_read(mapping))
2877	goto lookup;
2878
2879	if (rwc->try_lock) {
2880	rwc->contended = true;
2881	return;
2882	}
2883
2884	i_mmap_lock_read(mapping);
2885	}
2886	lookup:
2887	vma_interval_tree_foreach(vma, &mapping->i_mmap,
2888	pgoff_start, pgoff_end) {
2889	unsigned long address = vma_address(vma, pgoff: pgoff_start, nr_pages);
2890
2891	VM_BUG_ON_VMA(address == -EFAULT, vma);
2892	cond_resched();
2893
2894	if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
2895	continue;
2896
2897	if (!rwc->rmap_one(folio, vma, address, rwc->arg))
2898	goto done;
2899	if (rwc->done && rwc->done(folio))
2900	goto done;
2901	}
2902	done:
2903	if (!locked)
2904	i_mmap_unlock_read(mapping);
2905	}
2906
2907	/*
2908	* rmap_walk_file - do something to file page using the object-based rmap method
2909	* @folio: the folio to be handled
2910	* @rwc: control variable according to each walk type
2911	* @locked: caller holds relevant rmap lock
2912	*
2913	* Find all the mappings of a folio using the mapping pointer and the vma chains
2914	* contained in the address_space struct it points to.
2915	*/
2916	static void rmap_walk_file(struct folio *folio,
2917	struct rmap_walk_control *rwc, bool locked)
2918	{
2919	/*
2920	* The folio lock not only makes sure that folio->mapping cannot
2921	* suddenly be NULLified by truncation, it makes sure that the structure
2922	* at mapping cannot be freed and reused yet, so we can safely take
2923	* mapping->i_mmap_rwsem.
2924	*/
2925	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
2926
2927	if (!folio->mapping)
2928	return;
2929
2930	__rmap_walk_file(folio, mapping: folio->mapping, pgoff_start: folio->index,
2931	nr_pages: folio_nr_pages(folio), rwc, locked);
2932	}
2933
2934	void rmap_walk(struct folio folio, struct* rmap_walk_control *rwc)
2935	{
2936	if (unlikely(folio_test_ksm(folio)))
2937	rmap_walk_ksm(folio, rwc);
2938	else if (folio_test_anon(folio))
2939	rmap_walk_anon(folio, rwc, locked: false);
2940	else
2941	rmap_walk_file(folio, rwc, locked: false);
2942	}
2943
2944	/ Like rmap_walk, but caller holds relevant rmap lock /
2945	void rmap_walk_locked(struct folio folio, struct* rmap_walk_control *rwc)
2946	{
2947	/ no ksm support for now /
2948	VM_BUG_ON_FOLIO(folio_test_ksm(folio), folio);
2949	if (folio_test_anon(folio))
2950	rmap_walk_anon(folio, rwc, locked: true);
2951	else
2952	rmap_walk_file(folio, rwc, locked: true);
2953	}
2954
2955	#ifdef CONFIG_HUGETLB_PAGE
2956	/*
2957	* The following two functions are for anonymous (private mapped) hugepages.
2958	* Unlike common anonymous pages, anonymous hugepages have no accounting code
2959	* and no lru code, because we handle hugepages differently from common pages.
2960	*/
2961	void hugetlb_add_anon_rmap(struct folio folio, struct* vm_area_struct *vma,
2962	unsigned long address, rmap_t flags)
2963	{
2964	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
2965	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
2966
2967	atomic_inc(v: &folio->_entire_mapcount);
2968	atomic_inc(v: &folio->_large_mapcount);
2969	if (flags & RMAP_EXCLUSIVE)
2970	SetPageAnonExclusive(&folio->page);
2971	VM_WARN_ON_FOLIO(folio_entire_mapcount(folio) > `1` &&
2972	PageAnonExclusive(&folio->page), folio);
2973	}
2974
2975	void hugetlb_add_new_anon_rmap(struct folio *folio,
2976	struct vm_area_struct vma, unsigned* long address)
2977	{
2978	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
2979
2980	BUG_ON(address < vma->vm_start \|\| address >= vma->vm_end);
2981	/ increment count (starts at -1) /
2982	atomic_set(v: &folio->_entire_mapcount, i: `0`);
2983	atomic_set(v: &folio->_large_mapcount, i: `0`);
2984	folio_clear_hugetlb_restore_reserve(folio);
2985	__folio_set_anon(folio, vma, address, exclusive: true);
2986	SetPageAnonExclusive(&folio->page);
2987	}
2988	#endif /* CONFIG_HUGETLB_PAGE */
2989

source code of linux/mm/rmap.c