Skip to content

Commit c25f2fb

Browse files
committed
Merge tag 'mm-hotfixes-stable-2026-01-20-13-09' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull misc fixes from Andrew Morton:

 - A patch series from David Hildenbrand which fixes a few things
   related to hugetlb PMD sharing

 - The remainder are singletons, please see their changelogs for
   details

* tag 'mm-hotfixes-stable-2026-01-20-13-09' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm:
  mm: restore per-memcg proactive reclaim with !CONFIG_NUMA
  mm/kfence: fix potential deadlock in reboot notifier
  Docs/mm/allocation-profiling: describe sysctrl limitations in debug mode
  mm: do not copy page tables unnecessarily for VM_UFFD_WP
  mm/hugetlb: fix excessive IPI broadcasts when unsharing PMD tables using mmu_gather
  mm/rmap: fix two comments related to huge_pmd_unshare()
  mm/hugetlb: fix two comments related to huge_pmd_unshare()
  mm/hugetlb: fix hugetlb_pmd_shared()
  mm: remove unnecessary and incorrect mmap lock assert
  x86/kfence: avoid writing L1TF-vulnerable PTEs
  mm/vma: do not leak memory when .mmap_prepare swaps the file
  migrate: correct lock ordering for hugetlb file folios
  panic: only warn about deprecated panic_print on write access
  fs/writeback: skip AS_NO_DATA_INTEGRITY mappings in wait_sb_inodes()
  mm: take into account mm_cid size for mm_struct static definitions
  mm: rename cpu_bitmap field to flexible_array
  mm: add missing static initializer for init_mm::mm_cid.lock
2 parents c03e9c4 + 16aca2c commit c25f2fb

File tree

22 files changed

+341
-135
lines changed

22 files changed

+341
-135
lines changed

Documentation/admin-guide/sysctl/vm.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,10 @@ memory allocations.
494494

495495
The default value depends on CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT.
496496

497+
When CONFIG_MEM_ALLOC_PROFILING_DEBUG=y, this control is read-only to avoid
498+
warnings produced by allocations made while profiling is disabled and freed
499+
when it's enabled.
500+
497501

498502
memory_failure_early_kill
499503
=========================

Documentation/mm/allocation-profiling.rst

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,16 @@ Boot parameter:
3333
sysctl:
3434
/proc/sys/vm/mem_profiling
3535

36+
1: Enable memory profiling.
37+
38+
0: Disable memory profiling.
39+
40+
The default value depends on CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT.
41+
42+
When CONFIG_MEM_ALLOC_PROFILING_DEBUG=y, this control is read-only to avoid
43+
warnings produced by allocations made while profiling is disabled and freed
44+
when it's enabled.
45+
3646
Runtime info:
3747
/proc/allocinfo
3848

arch/x86/include/asm/kfence.h

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,22 +42,41 @@ static inline bool kfence_protect_page(unsigned long addr, bool protect)
4242
{
4343
unsigned int level;
4444
pte_t *pte = lookup_address(addr, &level);
45+
pteval_t val;
4546

4647
if (WARN_ON(!pte || level != PG_LEVEL_4K))
4748
return false;
4849

50+
val = pte_val(*pte);
51+
52+
/*
53+
* protect requires making the page not-present. If the PTE is
54+
* already in the right state, there's nothing to do.
55+
*/
56+
if (protect != !!(val & _PAGE_PRESENT))
57+
return true;
58+
59+
/*
60+
* Otherwise, invert the entire PTE. This avoids writing out an
61+
* L1TF-vulnerable PTE (not present, without the high address bits
62+
* set).
63+
*/
64+
set_pte(pte, __pte(~val));
65+
66+
/*
67+
* If the page was protected (non-present) and we're making it
68+
* present, there is no need to flush the TLB at all.
69+
*/
70+
if (!protect)
71+
return true;
72+
4973
/*
5074
* We need to avoid IPIs, as we may get KFENCE allocations or faults
5175
* with interrupts disabled. Therefore, the below is best-effort, and
5276
* does not flush TLBs on all CPUs. We can tolerate some inaccuracy;
5377
* lazy fault handling takes care of faults after the page is PRESENT.
5478
*/
5579

56-
if (protect)
57-
set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
58-
else
59-
set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
60-
6180
/*
6281
* Flush this CPU's TLB, assuming whoever did the allocation/free is
6382
* likely to continue running on this CPU.

drivers/firmware/efi/efi.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,10 +74,10 @@ struct mm_struct efi_mm = {
7474
.page_table_lock = __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock),
7575
.mmlist = LIST_HEAD_INIT(efi_mm.mmlist),
7676
.user_ns = &init_user_ns,
77-
.cpu_bitmap = { [BITS_TO_LONGS(NR_CPUS)] = 0},
7877
#ifdef CONFIG_SCHED_MM_CID
7978
.mm_cid.lock = __RAW_SPIN_LOCK_UNLOCKED(efi_mm.mm_cid.lock),
8079
#endif
80+
.flexible_array = MM_STRUCT_FLEXIBLE_ARRAY_INIT,
8181
};
8282

8383
struct workqueue_struct *efi_rts_wq;

fs/fs-writeback.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2750,8 +2750,13 @@ static void wait_sb_inodes(struct super_block *sb)
27502750
* The mapping can appear untagged while still on-list since we
27512751
* do not have the mapping lock. Skip it here, wb completion
27522752
* will remove it.
2753+
*
2754+
* If the mapping does not have data integrity semantics,
2755+
* there's no need to wait for the writeout to complete, as the
2756+
* mapping cannot guarantee that data is persistently stored.
27532757
*/
2754-
if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
2758+
if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK) ||
2759+
mapping_no_data_integrity(mapping))
27552760
continue;
27562761

27572762
spin_unlock_irq(&sb->s_inode_wblist_lock);

fs/fuse/file.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3200,8 +3200,10 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags)
32003200

32013201
inode->i_fop = &fuse_file_operations;
32023202
inode->i_data.a_ops = &fuse_file_aops;
3203-
if (fc->writeback_cache)
3203+
if (fc->writeback_cache) {
32043204
mapping_set_writeback_may_deadlock_on_reclaim(&inode->i_data);
3205+
mapping_set_no_data_integrity(&inode->i_data);
3206+
}
32053207

32063208
INIT_LIST_HEAD(&fi->write_files);
32073209
INIT_LIST_HEAD(&fi->queued_writes);

include/asm-generic/tlb.h

Lines changed: 75 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,8 @@
4646
*
4747
* The mmu_gather API consists of:
4848
*
49-
* - tlb_gather_mmu() / tlb_gather_mmu_fullmm() / tlb_finish_mmu()
49+
* - tlb_gather_mmu() / tlb_gather_mmu_fullmm() / tlb_gather_mmu_vma() /
50+
* tlb_finish_mmu()
5051
*
5152
* start and finish a mmu_gather
5253
*
@@ -364,6 +365,20 @@ struct mmu_gather {
364365
unsigned int vma_huge : 1;
365366
unsigned int vma_pfn : 1;
366367

368+
/*
369+
* Did we unshare (unmap) any shared page tables? For now only
370+
* used for hugetlb PMD table sharing.
371+
*/
372+
unsigned int unshared_tables : 1;
373+
374+
/*
375+
* Did we unshare any page tables such that they are now exclusive
376+
* and could get reused+modified by the new owner? When setting this
377+
* flag, "unshared_tables" will be set as well. For now only used
378+
* for hugetlb PMD table sharing.
379+
*/
380+
unsigned int fully_unshared_tables : 1;
381+
367382
unsigned int batch_count;
368383

369384
#ifndef CONFIG_MMU_GATHER_NO_GATHER
@@ -400,6 +415,7 @@ static inline void __tlb_reset_range(struct mmu_gather *tlb)
400415
tlb->cleared_pmds = 0;
401416
tlb->cleared_puds = 0;
402417
tlb->cleared_p4ds = 0;
418+
tlb->unshared_tables = 0;
403419
/*
404420
* Do not reset mmu_gather::vma_* fields here, we do not
405421
* call into tlb_start_vma() again to set them if there is an
@@ -484,7 +500,7 @@ static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
484500
* these bits.
485501
*/
486502
if (!(tlb->freed_tables || tlb->cleared_ptes || tlb->cleared_pmds ||
487-
tlb->cleared_puds || tlb->cleared_p4ds))
503+
tlb->cleared_puds || tlb->cleared_p4ds || tlb->unshared_tables))
488504
return;
489505

490506
tlb_flush(tlb);
@@ -773,6 +789,63 @@ static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd)
773789
}
774790
#endif
775791

792+
#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
793+
static inline void tlb_unshare_pmd_ptdesc(struct mmu_gather *tlb, struct ptdesc *pt,
794+
unsigned long addr)
795+
{
796+
/*
797+
* The caller must make sure that concurrent unsharing + exclusive
798+
* reuse is impossible until tlb_flush_unshared_tables() was called.
799+
*/
800+
VM_WARN_ON_ONCE(!ptdesc_pmd_is_shared(pt));
801+
ptdesc_pmd_pts_dec(pt);
802+
803+
/* Clearing a PUD pointing at a PMD table with PMD leaves. */
804+
tlb_flush_pmd_range(tlb, addr & PUD_MASK, PUD_SIZE);
805+
806+
/*
807+
* If the page table is now exclusively owned, we fully unshared
808+
* a page table.
809+
*/
810+
if (!ptdesc_pmd_is_shared(pt))
811+
tlb->fully_unshared_tables = true;
812+
tlb->unshared_tables = true;
813+
}
814+
815+
static inline void tlb_flush_unshared_tables(struct mmu_gather *tlb)
816+
{
817+
/*
818+
* As soon as the caller drops locks to allow for reuse of
819+
* previously-shared tables, these tables could get modified and
820+
* even reused outside of hugetlb context, so we have to make sure that
821+
* any page table walkers (incl. TLB, GUP-fast) are aware of that
822+
* change.
823+
*
824+
* Even if we are not fully unsharing a PMD table, we must
825+
* flush the TLB for the unsharer now.
826+
*/
827+
if (tlb->unshared_tables)
828+
tlb_flush_mmu_tlbonly(tlb);
829+
830+
/*
831+
* Similarly, we must make sure that concurrent GUP-fast will not
832+
* walk previously-shared page tables that are getting modified+reused
833+
* elsewhere. So broadcast an IPI to wait for any concurrent GUP-fast.
834+
*
835+
* We only perform this when we are the last sharer of a page table,
836+
* as the IPI will reach all CPUs: any GUP-fast.
837+
*
838+
* Note that on configs where tlb_remove_table_sync_one() is a NOP,
839+
* the expectation is that the tlb_flush_mmu_tlbonly() would have issued
840+
* required IPIs already for us.
841+
*/
842+
if (tlb->fully_unshared_tables) {
843+
tlb_remove_table_sync_one();
844+
tlb->fully_unshared_tables = false;
845+
}
846+
}
847+
#endif /* CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */
848+
776849
#endif /* CONFIG_MMU */
777850

778851
#endif /* _ASM_GENERIC__TLB_H */

include/linux/hugetlb.h

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -240,8 +240,9 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
240240
pte_t *huge_pte_offset(struct mm_struct *mm,
241241
unsigned long addr, unsigned long sz);
242242
unsigned long hugetlb_mask_last_page(struct hstate *h);
243-
int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
244-
unsigned long addr, pte_t *ptep);
243+
int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma,
244+
unsigned long addr, pte_t *ptep);
245+
void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma);
245246
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
246247
unsigned long *start, unsigned long *end);
247248

@@ -300,13 +301,17 @@ static inline struct address_space *hugetlb_folio_mapping_lock_write(
300301
return NULL;
301302
}
302303

303-
static inline int huge_pmd_unshare(struct mm_struct *mm,
304-
struct vm_area_struct *vma,
305-
unsigned long addr, pte_t *ptep)
304+
static inline int huge_pmd_unshare(struct mmu_gather *tlb,
305+
struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
306306
{
307307
return 0;
308308
}
309309

310+
static inline void huge_pmd_unshare_flush(struct mmu_gather *tlb,
311+
struct vm_area_struct *vma)
312+
{
313+
}
314+
310315
static inline void adjust_range_if_pmd_sharing_possible(
311316
struct vm_area_struct *vma,
312317
unsigned long *start, unsigned long *end)
@@ -1326,7 +1331,7 @@ static inline __init void hugetlb_cma_reserve(int order)
13261331
#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
13271332
static inline bool hugetlb_pmd_shared(pte_t *pte)
13281333
{
1329-
return page_count(virt_to_page(pte)) > 1;
1334+
return ptdesc_pmd_is_shared(virt_to_ptdesc(pte));
13301335
}
13311336
#else
13321337
static inline bool hugetlb_pmd_shared(pte_t *pte)

include/linux/mm.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -608,7 +608,11 @@ enum {
608608
/*
609609
* Flags which should result in page tables being copied on fork. These are
610610
* flags which indicate that the VMA maps page tables which cannot be
611-
* reconsistuted upon page fault, so necessitate page table copying upon
611+
* reconstituted upon page fault, so necessitate page table copying upon fork.
612+
*
613+
* Note that these flags should be compared with the DESTINATION VMA not the
614+
* source, as VM_UFFD_WP may not be propagated to destination, while all other
615+
* flags will be.
612616
*
613617
* VM_PFNMAP / VM_MIXEDMAP - These contain kernel-mapped data which cannot be
614618
* reasonably reconstructed on page fault.

include/linux/mm_types.h

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1329,7 +1329,7 @@ struct mm_struct {
13291329
* The mm_cpumask needs to be at the end of mm_struct, because it
13301330
* is dynamically sized based on nr_cpu_ids.
13311331
*/
1332-
unsigned long cpu_bitmap[];
1332+
char flexible_array[] __aligned(__alignof__(unsigned long));
13331333
};
13341334

13351335
/* Copy value to the first system word of mm flags, non-atomically. */
@@ -1366,19 +1366,24 @@ static inline void __mm_flags_set_mask_bits_word(struct mm_struct *mm,
13661366
MT_FLAGS_USE_RCU)
13671367
extern struct mm_struct init_mm;
13681368

1369+
#define MM_STRUCT_FLEXIBLE_ARRAY_INIT \
1370+
{ \
1371+
[0 ... sizeof(cpumask_t) + MM_CID_STATIC_SIZE - 1] = 0 \
1372+
}
1373+
13691374
/* Pointer magic because the dynamic array size confuses some compilers. */
13701375
static inline void mm_init_cpumask(struct mm_struct *mm)
13711376
{
13721377
unsigned long cpu_bitmap = (unsigned long)mm;
13731378

1374-
cpu_bitmap += offsetof(struct mm_struct, cpu_bitmap);
1379+
cpu_bitmap += offsetof(struct mm_struct, flexible_array);
13751380
cpumask_clear((struct cpumask *)cpu_bitmap);
13761381
}
13771382

13781383
/* Future-safe accessor for struct mm_struct's cpu_vm_mask. */
13791384
static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
13801385
{
1381-
return (struct cpumask *)&mm->cpu_bitmap;
1386+
return (struct cpumask *)&mm->flexible_array;
13821387
}
13831388

13841389
#ifdef CONFIG_LRU_GEN
@@ -1469,7 +1474,7 @@ static inline cpumask_t *mm_cpus_allowed(struct mm_struct *mm)
14691474
{
14701475
unsigned long bitmap = (unsigned long)mm;
14711476

1472-
bitmap += offsetof(struct mm_struct, cpu_bitmap);
1477+
bitmap += offsetof(struct mm_struct, flexible_array);
14731478
/* Skip cpu_bitmap */
14741479
bitmap += cpumask_size();
14751480
return (struct cpumask *)bitmap;
@@ -1495,7 +1500,7 @@ static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *
14951500
mm_init_cid(mm, p);
14961501
return 0;
14971502
}
1498-
#define mm_alloc_cid(...) alloc_hooks(mm_alloc_cid_noprof(__VA_ARGS__))
1503+
# define mm_alloc_cid(...) alloc_hooks(mm_alloc_cid_noprof(__VA_ARGS__))
14991504

15001505
static inline void mm_destroy_cid(struct mm_struct *mm)
15011506
{
@@ -1509,6 +1514,8 @@ static inline unsigned int mm_cid_size(void)
15091514
return cpumask_size() + bitmap_size(num_possible_cpus());
15101515
}
15111516

1517+
/* Use 2 * NR_CPUS as worst case for static allocation. */
1518+
# define MM_CID_STATIC_SIZE (2 * sizeof(cpumask_t))
15121519
#else /* CONFIG_SCHED_MM_CID */
15131520
static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { }
15141521
static inline int mm_alloc_cid(struct mm_struct *mm, struct task_struct *p) { return 0; }
@@ -1517,11 +1524,13 @@ static inline unsigned int mm_cid_size(void)
15171524
{
15181525
return 0;
15191526
}
1527+
# define MM_CID_STATIC_SIZE 0
15201528
#endif /* CONFIG_SCHED_MM_CID */
15211529

15221530
struct mmu_gather;
15231531
extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
15241532
extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
1533+
void tlb_gather_mmu_vma(struct mmu_gather *tlb, struct vm_area_struct *vma);
15251534
extern void tlb_finish_mmu(struct mmu_gather *tlb);
15261535

15271536
struct vm_fault;

0 commit comments

Comments
 (0)