linux/mm/memory.c at 28dfe4317541e57fe52f9a290394cd29c348228b · OpenGamingCollective/linux · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * demand-loading started 01.12.91 - seems it is high on the list of
 * things wanted, and it should be easy to implement. - Linus
 */

/*
 * Ok, demand-loading was easy, shared pages a little bit tricker. Shared
 * pages started 02.12.91, seems to work. - Linus.
 *
 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
 * would have taken more than the 6M I have free, but it worked well as
 * far as I could see.
 *
 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
 */

/*
 * Real VM (paging to/from disk) started 18.12.91. Much more work and
 * thought has to go into this. Oh, well..
 * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
 *		Found it. Everything seems to work now.
 * 20.12.91  -  Ok, making the swap-device changeable like the root.
 */

/*
 * 05.04.94  -  Multi-page memory management added for v1.1.
 *              Idea by Alex Bligh (alex@cconcepts.co.uk)
 *
 * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
 *		(Gerhard.Wichert@pdb.siemens.de)
 *
 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
 */

#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>
#include <linux/kmsan.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/leafops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
#include <linux/shmem_fs.h>
#include <linux/memory-tiers.h>
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
#include <linux/dax.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/perf_event.h>
#include <linux/ptrace.h>
#include <linux/vmalloc.h>
#include <linux/sched/sysctl.h>
#include <linux/pgalloc.h>
#include <linux/uaccess.h>

#include <trace/events/kmem.h>

#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>

#include "pgalloc-track.h"
#include "internal.h"
#include "swap.h"

#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif

static vm_fault_t do_fault(struct vm_fault *vmf);
static vm_fault_t do_anonymous_page(struct vm_fault *vmf);
static bool vmf_pte_changed(struct vm_fault *vmf);

/*
 * Return true if the original pte was a uffd-wp pte marker (so the pte was
 * wr-protected).
 */
static __always_inline bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf)
{
	if (!userfaultfd_wp(vmf->vma))
		return false;
	if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
		return false;

	return pte_is_uffd_wp_marker(vmf->orig_pte);
}

/*
 * Randomize the address space (stacks, mmaps, brk, etc.).
 *
 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
 *   as ancient (libc5 based) binaries can segfault. )
 */
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
					1;
#else
					2;
#endif

static const struct ctl_table mmu_sysctl_table[] = {
	{
		.procname	= "randomize_va_space",
		.data		= &randomize_va_space,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
};

static int __init init_mm_sysctl(void)
{
	register_sysctl_init("kernel", mmu_sysctl_table);
	return 0;
}

subsys_initcall(init_mm_sysctl);

#ifndef arch_wants_old_prefaulted_pte
static inline bool arch_wants_old_prefaulted_pte(void)
{
	/*
	 * Transitioning a PTE from 'old' to 'young' can be expensive on
	 * some architectures, even if it's performed in hardware. By
	 * default, "false" means prefaulted entries will be 'young'.
	 */
	return false;
}
#endif

static int __init disable_randmaps(char *s)
{
	randomize_va_space = 0;
	return 1;
}
__setup("norandmaps", disable_randmaps);

unsigned long zero_pfn __read_mostly;
EXPORT_SYMBOL(zero_pfn);

unsigned long highest_memmap_pfn __read_mostly;

/*
 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
 */
static int __init init_zero_pfn(void)
{
	zero_pfn = page_to_pfn(ZERO_PAGE(0));
	return 0;
}
early_initcall(init_zero_pfn);

void mm_trace_rss_stat(struct mm_struct *mm, int member)
{
	trace_rss_stat(mm, member);
}

/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
			   unsigned long addr)
{
	pgtable_t token = pmd_pgtable(*pmd);
	pmd_clear(pmd);
	pte_free_tlb(tlb, token, addr);
	mm_dec_nr_ptes(tlb->mm);
}

static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		free_pte_range(tlb, pmd, addr);
	} while (pmd++, addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
	mm_dec_nr_pmds(tlb->mm);
}

static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
	} while (pud++, addr = next, addr != end);

	start &= P4D_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= P4D_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(p4d, start);
	p4d_clear(p4d);
	pud_free_tlb(tlb, pud, start);
	mm_dec_nr_puds(tlb->mm);
}

static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	p4d_t *p4d;
	unsigned long next;
	unsigned long start;

	start = addr;
	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d))
			continue;
		free_pud_range(tlb, p4d, addr, next, floor, ceiling);
	} while (p4d++, addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	p4d = p4d_offset(pgd, start);
	pgd_clear(pgd);
	p4d_free_tlb(tlb, p4d, start);
}

/**
 * free_pgd_range - Unmap and free page tables in the range
 * @tlb: the mmu_gather containing pending TLB flush info
 * @addr: virtual address start
 * @end: virtual address end
 * @floor: lowest address boundary
 * @ceiling: highest address boundary
 *
 * This function tears down all user-level page tables in the
 * specified virtual address range [@addr..@end). It is part of
 * the memory unmap flow.
 */
void free_pgd_range(struct mmu_gather *tlb,
			unsigned long addr, unsigned long end,
			unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * The next few lines have given us lots of grief...
	 *
	 * Why are we testing PMD* at this top level?  Because often
	 * there will be no work to do at all, and we'd prefer not to
	 * go all the way down to the bottom just to discover that.
	 *
	 * Why all these "- 1"s?  Because 0 represents both the bottom
	 * of the address space and the top of it (using -1 for the
	 * top wouldn't help much: the masks would do the wrong thing).
	 * The rule is that addr 0 and floor 0 refer to the bottom of
	 * the address space, but end 0 and ceiling 0 refer to the top
	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
	 * that end 0 case should be mythical).
	 *
	 * Wherever addr is brought up or ceiling brought down, we must
	 * be careful to reject "the opposite 0" before it confuses the
	 * subsequent tests.  But what about where end is brought down
	 * by PMD_SIZE below? no, end can't go down to 0 there.
	 *
	 * Whereas we round start (addr) and ceiling down, by different
	 * masks at different levels, in order to test whether a table
	 * now has no other vmas using it, so can be freed, we don't
	 * bother to round floor or end up - the tests don't need that.
	 */

	addr &= PMD_MASK;
	if (addr < floor) {
		addr += PMD_SIZE;
		if (!addr)
			return;
	}
	if (ceiling) {
		ceiling &= PMD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		end -= PMD_SIZE;
	if (addr > end - 1)
		return;
	/*
	 * We add page table cache pages with PAGE_SIZE,
	 * (see pte_free_tlb()), flush the tlb if we need
	 */
	tlb_change_page_size(tlb, PAGE_SIZE);
	pgd = pgd_offset(tlb->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		free_p4d_range(tlb, pgd, addr, next, floor, ceiling);
	} while (pgd++, addr = next, addr != end);
}

/**
 * free_pgtables() - Free a range of page tables
 * @tlb: The mmu gather
 * @unmap: The unmap_desc
 *
 * Note: pg_start and pg_end are provided to indicate the absolute range of the
 * page tables that should be removed.  This can differ from the vma mappings on
 * some archs that may have mappings that need to be removed outside the vmas.
 * Note that the prev->vm_end and next->vm_start are often used.
 *
 * The vma_end differs from the pg_end when a dup_mmap() failed and the tree has
 * unrelated data to the mm_struct being torn down.
 */
void free_pgtables(struct mmu_gather *tlb, struct unmap_desc *unmap)
{
	struct unlink_vma_file_batch vb;
	struct ma_state *mas = unmap->mas;
	struct vm_area_struct *vma = unmap->first;

	/*
	 * Note: USER_PGTABLES_CEILING may be passed as the value of pg_end and
	 * may be 0.  Underflow is expected in this case.  Otherwise the
	 * pagetable end is exclusive.  vma_end is exclusive.  The last vma
	 * address should never be larger than the pagetable end.
	 */
	WARN_ON_ONCE(unmap->vma_end - 1 > unmap->pg_end - 1);

	tlb_free_vmas(tlb);

	do {
		unsigned long addr = vma->vm_start;
		struct vm_area_struct *next;

		next = mas_find(mas, unmap->tree_end - 1);

		/*
		 * Hide vma from rmap and truncate_pagecache before freeing
		 * pgtables
		 */
		if (unmap->mm_wr_locked)
			vma_start_write(vma);
		unlink_anon_vmas(vma);

		unlink_file_vma_batch_init(&vb);
		unlink_file_vma_batch_add(&vb, vma);

		/*
		 * Optimization: gather nearby vmas into one call down
		 */
		while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
			vma = next;
			next = mas_find(mas, unmap->tree_end - 1);
			if (unmap->mm_wr_locked)
				vma_start_write(vma);
			unlink_anon_vmas(vma);
			unlink_file_vma_batch_add(&vb, vma);
		}
		unlink_file_vma_batch_final(&vb);

		free_pgd_range(tlb, addr, vma->vm_end, unmap->pg_start,
			       next ? next->vm_start : unmap->pg_end);
		vma = next;
	} while (vma);
}

void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
{
	spinlock_t *ptl = pmd_lock(mm, pmd);

	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		mm_inc_nr_ptes(mm);
		/*
		 * Ensure all pte setup (eg. pte page lock and page clearing) are
		 * visible before the pte is made visible to other CPUs by being
		 * put into page tables.
		 *
		 * The other side of the story is the pointer chasing in the page
		 * table walking code (when walking the page table without locking;
		 * ie. most of the time). Fortunately, these data accesses consist
		 * of a chain of data-dependent loads, meaning most CPUs (alpha
		 * being the notable exception) will already guarantee loads are
		 * seen in-order. See the alpha page table accessors for the
		 * smp_rmb() barriers in page table walking code.
		 */
		smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
		pmd_populate(mm, pmd, *pte);
		*pte = NULL;
	}
	spin_unlock(ptl);
}

int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
{
	pgtable_t new = pte_alloc_one(mm);
	if (!new)
		return -ENOMEM;

	pmd_install(mm, pmd, &new);
	if (new)
		pte_free(mm, new);
	return 0;
}

int __pte_alloc_kernel(pmd_t *pmd)
{
	pte_t *new = pte_alloc_one_kernel(&init_mm);
	if (!new)
		return -ENOMEM;

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		smp_wmb(); /* See comment in pmd_install() */
		pmd_populate_kernel(&init_mm, pmd, new);
		new = NULL;
	}
	spin_unlock(&init_mm.page_table_lock);
	if (new)
		pte_free_kernel(&init_mm, new);
	return 0;
}

static inline void init_rss_vec(int *rss)
{
	memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
}

static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
{
	int i;

	for (i = 0; i < NR_MM_COUNTERS; i++)
		if (rss[i])
			add_mm_counter(mm, i, rss[i]);
}

static bool is_bad_page_map_ratelimited(void)
{
	static unsigned long resume;
	static unsigned long nr_shown;
	static unsigned long nr_unshown;

	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */
	if (nr_shown == 60) {
		if (time_before(jiffies, resume)) {
			nr_unshown++;
			return true;
		}
		if (nr_unshown) {
			pr_alert("BUG: Bad page map: %lu messages suppressed\n",
				 nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = jiffies + 60 * HZ;
	return false;
}

static void __print_bad_page_map_pgtable(struct mm_struct *mm, unsigned long addr)
{
	unsigned long long pgdv, p4dv, pudv, pmdv;
	p4d_t p4d, *p4dp;
	pud_t pud, *pudp;
	pmd_t pmd, *pmdp;
	pgd_t *pgdp;

	/*
	 * Although this looks like a fully lockless pgtable walk, it is not:
	 * see locking requirements for print_bad_page_map().
	 */
	pgdp = pgd_offset(mm, addr);
	pgdv = pgd_val(*pgdp);

	if (!pgd_present(*pgdp) || pgd_leaf(*pgdp)) {
		pr_alert("pgd:%08llx\n", pgdv);
		return;
	}

	p4dp = p4d_offset(pgdp, addr);
	p4d = p4dp_get(p4dp);
	p4dv = p4d_val(p4d);

	if (!p4d_present(p4d) || p4d_leaf(p4d)) {
		pr_alert("pgd:%08llx p4d:%08llx\n", pgdv, p4dv);
		return;
	}

	pudp = pud_offset(p4dp, addr);
	pud = pudp_get(pudp);
	pudv = pud_val(pud);

	if (!pud_present(pud) || pud_leaf(pud)) {
		pr_alert("pgd:%08llx p4d:%08llx pud:%08llx\n", pgdv, p4dv, pudv);
		return;
	}

	pmdp = pmd_offset(pudp, addr);
	pmd = pmdp_get(pmdp);
	pmdv = pmd_val(pmd);

	/*
	 * Dumping the PTE would be nice, but it's tricky with CONFIG_HIGHPTE,
	 * because the table should already be mapped by the caller and
	 * doing another map would be bad. print_bad_page_map() should
	 * already take care of printing the PTE.
	 */
	pr_alert("pgd:%08llx p4d:%08llx pud:%08llx pmd:%08llx\n", pgdv,
		 p4dv, pudv, pmdv);
}

/*
 * This function is called to print an error when a bad page table entry (e.g.,
 * corrupted page table entry) is found. For example, we might have a
 * PFN-mapped pte in a region that doesn't allow it.
 *
 * The calling function must still handle the error.
 *
 * This function must be called during a proper page table walk, as it will
 * re-walk the page table to dump information: the caller MUST prevent page
 * table teardown (by holding mmap, vma or rmap lock) and MUST hold the leaf
 * page table lock.
 */
static void print_bad_page_map(struct vm_area_struct *vma,
		unsigned long addr, unsigned long long entry, struct page *page,
		enum pgtable_level level)
{
	struct address_space *mapping;
	pgoff_t index;

	if (is_bad_page_map_ratelimited())
		return;

	mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
	index = linear_page_index(vma, addr);

	pr_alert("BUG: Bad page map in process %s  %s:%08llx", current->comm,
		 pgtable_level_to_str(level), entry);
	__print_bad_page_map_pgtable(vma->vm_mm, addr);
	if (page)
		dump_page(page, "bad page map");
	pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
		 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
	pr_alert("file:%pD fault:%ps mmap:%ps mmap_prepare: %ps read_folio:%ps\n",
		 vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->fault : NULL,
		 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
		 vma->vm_file ? vma->vm_file->f_op->mmap_prepare : NULL,
		 mapping ? mapping->a_ops->read_folio : NULL);
	dump_stack();
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
#define print_bad_pte(vma, addr, pte, page) \
	print_bad_page_map(vma, addr, pte_val(pte), page, PGTABLE_LEVEL_PTE)

/**
 * __vm_normal_page() - Get the "struct page" associated with a page table entry.
 * @vma: The VMA mapping the page table entry.
 * @addr: The address where the page table entry is mapped.
 * @pfn: The PFN stored in the page table entry.
 * @special: Whether the page table entry is marked "special".
 * @level: The page table level for error reporting purposes only.
 * @entry: The page table entry value for error reporting purposes only.
 *
 * "Special" mappings do not wish to be associated with a "struct page" (either
 * it doesn't exist, or it exists but they don't want to touch it). In this
 * case, NULL is returned here. "Normal" mappings do have a struct page and
 * are ordinarily refcounted.
 *
 * Page mappings of the shared zero folios are always considered "special", as
 * they are not ordinarily refcounted: neither the refcount nor the mapcount
 * of these folios is adjusted when mapping them into user page tables.
 * Selected page table walkers (such as GUP) can still identify mappings of the
 * shared zero folios and work with the underlying "struct page".
 *
 * There are 2 broad cases. Firstly, an architecture may define a "special"
 * page table entry bit, such as pte_special(), in which case this function is
 * trivial. Secondly, an architecture may not have a spare page table
 * entry bit, which requires a more complicated scheme, described below.
 *
 * With CONFIG_FIND_NORMAL_PAGE, we might have the "special" bit set on
 * page table entries that actually map "normal" pages: however, that page
 * cannot be looked up through the PFN stored in the page table entry, but
 * instead will be looked up through vm_ops->find_normal_page(). So far, this
 * only applies to PTEs.
 *
 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
 * special mapping (even if there are underlying and valid "struct pages").
 * COWed pages of a VM_PFNMAP are always normal.
 *
 * The way we recognize COWed pages within VM_PFNMAP mappings is through the
 * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
 * set, and the vm_pgoff will point to the first PFN mapped: thus every special
 * mapping will always honor the rule
 *
 *	pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
 *
 * And for normal mappings this is false.
 *
 * This restricts such mappings to be a linear translation from virtual address
 * to pfn. To get around this restriction, we allow arbitrary mappings so long
 * as the vma is not a COW mapping; in that case, we know that all ptes are
 * special (because none can have been COWed).
 *
 *
 * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
 *
 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
 * page" backing, however the difference is that _all_ pages with a struct
 * page (that is, those where pfn_valid is true, except the shared zero
 * folios) are refcounted and considered normal pages by the VM.
 *
 * The disadvantage is that pages are refcounted (which can be slower and
 * simply not an option for some PFNMAP users). The advantage is that we
 * don't have to follow the strict linearity rule of PFNMAP mappings in
 * order to support COWable mappings.
 *
 * Return: Returns the "struct page" if this is a "normal" mapping. Returns
 *	   NULL if this is a "special" mapping.
 */
static inline struct page *__vm_normal_page(struct vm_area_struct *vma,
		unsigned long addr, unsigned long pfn, bool special,
		unsigned long long entry, enum pgtable_level level)
{
	if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
		if (unlikely(special)) {
#ifdef CONFIG_FIND_NORMAL_PAGE
			if (vma->vm_ops && vma->vm_ops->find_normal_page)
				return vma->vm_ops->find_normal_page(vma, addr);
#endif /* CONFIG_FIND_NORMAL_PAGE */
			if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
				return NULL;
			if (is_zero_pfn(pfn) || is_huge_zero_pfn(pfn))
				return NULL;

			print_bad_page_map(vma, addr, entry, NULL, level);
			return NULL;
		}
		/*
		 * With CONFIG_ARCH_HAS_PTE_SPECIAL, any special page table
		 * mappings (incl. shared zero folios) are marked accordingly.
		 */
	} else {
		if (unlikely(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) {
			if (vma->vm_flags & VM_MIXEDMAP) {
				/* If it has a "struct page", it's "normal". */
				if (!pfn_valid(pfn))
					return NULL;
			} else {
				unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;

				/* Only CoW'ed anon folios are "normal". */
				if (pfn == vma->vm_pgoff + off)
					return NULL;
				if (!is_cow_mapping(vma->vm_flags))
					return NULL;
			}
		}

		if (is_zero_pfn(pfn) || is_huge_zero_pfn(pfn))
			return NULL;
	}

	if (unlikely(pfn > highest_memmap_pfn)) {
		/* Corrupted page table entry. */
		print_bad_page_map(vma, addr, entry, NULL, level);
		return NULL;
	}
	/*
	 * NOTE! We still have PageReserved() pages in the page tables.
	 * For example, VDSO mappings can cause them to exist.
	 */
	VM_WARN_ON_ONCE(is_zero_pfn(pfn) || is_huge_zero_pfn(pfn));
	return pfn_to_page(pfn);
}

/**
 * vm_normal_page() - Get the "struct page" associated with a PTE
 * @vma: The VMA mapping the @pte.
 * @addr: The address where the @pte is mapped.
 * @pte: The PTE.
 *
 * Get the "struct page" associated with a PTE. See __vm_normal_page()
 * for details on "normal" and "special" mappings.
 *
 * Return: Returns the "struct page" if this is a "normal" mapping. Returns
 *	   NULL if this is a "special" mapping.
 */
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
			    pte_t pte)
{
	return __vm_normal_page(vma, addr, pte_pfn(pte), pte_special(pte),
				pte_val(pte), PGTABLE_LEVEL_PTE);
}

/**
 * vm_normal_folio() - Get the "struct folio" associated with a PTE
 * @vma: The VMA mapping the @pte.
 * @addr: The address where the @pte is mapped.
 * @pte: The PTE.
 *
 * Get the "struct folio" associated with a PTE. See __vm_normal_page()
 * for details on "normal" and "special" mappings.
 *
 * Return: Returns the "struct folio" if this is a "normal" mapping. Returns
 *	   NULL if this is a "special" mapping.
 */
struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
			    pte_t pte)
{
	struct page *page = vm_normal_page(vma, addr, pte);

	if (page)
		return page_folio(page);
	return NULL;
}

#ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
/**
 * vm_normal_page_pmd() - Get the "struct page" associated with a PMD
 * @vma: The VMA mapping the @pmd.
 * @addr: The address where the @pmd is mapped.
 * @pmd: The PMD.
 *
 * Get the "struct page" associated with a PTE. See __vm_normal_page()
 * for details on "normal" and "special" mappings.
 *
 * Return: Returns the "struct page" if this is a "normal" mapping. Returns
 *	   NULL if this is a "special" mapping.
 */
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
				pmd_t pmd)
{
	return __vm_normal_page(vma, addr, pmd_pfn(pmd), pmd_special(pmd),
				pmd_val(pmd), PGTABLE_LEVEL_PMD);
}

/**
 * vm_normal_folio_pmd() - Get the "struct folio" associated with a PMD
 * @vma: The VMA mapping the @pmd.
 * @addr: The address where the @pmd is mapped.
 * @pmd: The PMD.
 *
 * Get the "struct folio" associated with a PTE. See __vm_normal_page()
 * for details on "normal" and "special" mappings.
 *
 * Return: Returns the "struct folio" if this is a "normal" mapping. Returns
 *	   NULL if this is a "special" mapping.
 */
struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
				  unsigned long addr, pmd_t pmd)
{
	struct page *page = vm_normal_page_pmd(vma, addr, pmd);

	if (page)
		return page_folio(page);
	return NULL;
}

/**
 * vm_normal_page_pud() - Get the "struct page" associated with a PUD
 * @vma: The VMA mapping the @pud.
 * @addr: The address where the @pud is mapped.
 * @pud: The PUD.
 *
 * Get the "struct page" associated with a PUD. See __vm_normal_page()
 * for details on "normal" and "special" mappings.
 *
 * Return: Returns the "struct page" if this is a "normal" mapping. Returns
 *	   NULL if this is a "special" mapping.
 */
struct page *vm_normal_page_pud(struct vm_area_struct *vma,
		unsigned long addr, pud_t pud)
{
	return __vm_normal_page(vma, addr, pud_pfn(pud), pud_special(pud),
				pud_val(pud), PGTABLE_LEVEL_PUD);
}
#endif

/**
 * restore_exclusive_pte - Restore a device-exclusive entry
 * @vma: VMA covering @address
 * @folio: the mapped folio
 * @page: the mapped folio page
 * @address: the virtual address
 * @ptep: pte pointer into the locked page table mapping the folio page
 * @orig_pte: pte value at @ptep
 *
 * Restore a device-exclusive non-swap entry to an ordinary present pte.
 *
 * The folio and the page table must be locked, and MMU notifiers must have
 * been called to invalidate any (exclusive) device mappings.
 *
 * Locking the folio makes sure that anybody who just converted the pte to
 * a device-exclusive entry can map it into the device to make forward
 * progress without others converting it back until the folio was unlocked.
 *
 * If the folio lock ever becomes an issue, we can stop relying on the folio
 * lock; it might make some scenarios with heavy thrashing less likely to
 * make forward progress, but these scenarios might not be valid use cases.
 *
 * Note that the folio lock does not protect against all cases of concurrent
 * page table modifications (e.g., MADV_DONTNEED, mprotect), so device drivers
 * must use MMU notifiers to sync against any concurrent changes.
 */
static void restore_exclusive_pte(struct vm_area_struct *vma,
		struct folio *folio, struct page *page, unsigned long address,
		pte_t *ptep, pte_t orig_pte)
{
	pte_t pte;

	VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);

	pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
	if (pte_swp_soft_dirty(orig_pte))
		pte = pte_mksoft_dirty(pte);

	if (pte_swp_uffd_wp(orig_pte))
		pte = pte_mkuffd_wp(pte);

	if ((vma->vm_flags & VM_WRITE) &&
	    can_change_pte_writable(vma, address, pte)) {
		if (folio_test_dirty(folio))
			pte = pte_mkdirty(pte);
		pte = pte_mkwrite(pte, vma);
	}
	set_pte_at(vma->vm_mm, address, ptep, pte);

	/*
	 * No need to invalidate - it was non-present before. However
	 * secondary CPUs may have mappings that need invalidating.
	 */
	update_mmu_cache(vma, address, ptep);
}

/*
 * Tries to restore an exclusive pte if the page lock can be acquired without
 * sleeping.
 */
static int try_restore_exclusive_pte(struct vm_area_struct *vma,
		unsigned long addr, pte_t *ptep, pte_t orig_pte)
{
	const softleaf_t entry = softleaf_from_pte(orig_pte);
	struct page *page = softleaf_to_page(entry);
	struct folio *folio = page_folio(page);

	if (folio_trylock(folio)) {
		restore_exclusive_pte(vma, folio, page, addr, ptep, orig_pte);
		folio_unlock(folio);
		return 0;
	}

	return -EBUSY;
}

/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 */

static unsigned long
copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma,
		struct vm_area_struct *src_vma, unsigned long addr, int *rss)
{
	vm_flags_t vm_flags = dst_vma->vm_flags;
	pte_t orig_pte = ptep_get(src_pte);
	softleaf_t entry = softleaf_from_pte(orig_pte);
	pte_t pte = orig_pte;
	struct folio *folio;
	struct page *page;

	if (likely(softleaf_is_swap(entry))) {
		if (swap_dup_entry_direct(entry) < 0)
			return -EIO;

		/* make sure dst_mm is on swapoff's mmlist. */
		if (unlikely(list_empty(&dst_mm->mmlist))) {
			spin_lock(&mmlist_lock);
			if (list_empty(&dst_mm->mmlist))
				list_add(&dst_mm->mmlist,
						&src_mm->mmlist);
			spin_unlock(&mmlist_lock);
		}
		/* Mark the swap entry as shared. */
		if (pte_swp_exclusive(orig_pte)) {
			pte = pte_swp_clear_exclusive(orig_pte);
			set_pte_at(src_mm, addr, src_pte, pte);
		}
		rss[MM_SWAPENTS]++;
	} else if (softleaf_is_migration(entry)) {
		folio = softleaf_to_folio(entry);

		rss[mm_counter(folio)]++;

		if (!softleaf_is_migration_read(entry) &&
				is_cow_mapping(vm_flags)) {
			/*
			 * COW mappings require pages in both parent and child
			 * to be set to read. A previously exclusive entry is
			 * now shared.
			 */
			entry = make_readable_migration_entry(
							swp_offset(entry));
			pte = softleaf_to_pte(entry);
			if (pte_swp_soft_dirty(orig_pte))
				pte = pte_swp_mksoft_dirty(pte);
			if (pte_swp_uffd_wp(orig_pte))
				pte = pte_swp_mkuffd_wp(pte);
			set_pte_at(src_mm, addr, src_pte, pte);
		}
	} else if (softleaf_is_device_private(entry)) {
		page = softleaf_to_page(entry);
		folio = page_folio(page);

		/*
		 * Update rss count even for unaddressable pages, as
		 * they should treated just like normal pages in this
		 * respect.
		 *
		 * We will likely want to have some new rss counters
		 * for unaddressable pages, at some point. But for now
		 * keep things as they are.
		 */