Skip to content

Commit 576ee5d

Browse files
committed
fs: add immutable rootfs
Currently pivot_root() doesn't work on the real rootfs because it cannot be unmounted. Userspace has to do a recursive removal of the initramfs contents manually before continuing the boot. Really all we want from the real rootfs is to serve as the parent mount for anything that is actually useful such as the tmpfs or ramfs for initramfs unpacking or the rootfs itself. There's no need for the real rootfs to actually be anything meaningful or useful. Add an immutable rootfs called "nullfs" that can be selected via the "nullfs_rootfs" kernel command line option. The kernel will mount a tmpfs/ramfs on top of it, unpack the initramfs and fire up userspace which mounts the rootfs and can then just do: chdir(rootfs); pivot_root(".", "."); umount2(".", MNT_DETACH); and be done with it. (Of course, userspace can also choose to retain the initramfs contents by using something like pivot_root(".", "/initramfs") without unmounting it.) Technically this also means that the rootfs mount in unprivileged namespaces doesn't need to become MNT_LOCKED anymore as it's guaranteed that the immutable rootfs remains permanently empty so there cannot be anything revealed by unmounting the covering mount. In the future this will also allow us to create completely empty mount namespaces without the risk of leaking anything. systemd already handles this all correctly as it tries to pivot_root() first and falls back to MS_MOVE only when that fails. This goes back to various discussions in previous years and an LPC 2024 presentation about this very topic. Link: https://patch.msgid.link/20260112-work-immutable-rootfs-v2-3-88dd1c34a204@kernel.org Signed-off-by: Christian Brauner <brauner@kernel.org>
1 parent 3c1b73f commit 576ee5d

File tree

7 files changed

+159
-12
lines changed

7 files changed

+159
-12
lines changed

fs/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ obj-y := open.o read_write.o file_table.o super.o \
1616
stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
1717
fs_dirent.o fs_context.o fs_parser.o fsopen.o init.o \
1818
kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o \
19-
file_attr.o
19+
file_attr.o nullfs.o
2020

2121
obj-$(CONFIG_BUFFER_HEAD) += buffer.o mpage.o
2222
obj-$(CONFIG_PROC_FS) += proc_namespace.o

fs/mount.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include <linux/ns_common.h>
66
#include <linux/fs_pin.h>
77

8+
extern struct file_system_type nullfs_fs_type;
89
extern struct list_head notify_list;
910

1011
struct mnt_namespace {

fs/namespace.c

Lines changed: 71 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,17 @@ static int __init initramfs_options_setup(char *str)
7575

7676
__setup("initramfs_options=", initramfs_options_setup);
7777

78+
bool nullfs_rootfs = false;
79+
80+
static int __init nullfs_rootfs_setup(char *str)
81+
{
82+
if (*str)
83+
return 0;
84+
nullfs_rootfs = true;
85+
return 1;
86+
}
87+
__setup("nullfs_rootfs", nullfs_rootfs_setup);
88+
7889
static u64 event;
7990
static DEFINE_XARRAY_FLAGS(mnt_id_xa, XA_FLAGS_ALLOC);
8091
static DEFINE_IDA(mnt_group_ida);
@@ -4582,8 +4593,9 @@ int path_pivot_root(struct path *new, struct path *old)
45824593
* pointed to by put_old must yield the same directory as new_root. No other
45834594
* file system may be mounted on put_old. After all, new_root is a mountpoint.
45844595
*
4585-
* Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
4586-
* See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives
4596+
* Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem
4597+
* unless the kernel was booted with "nullfs_rootfs". See
4598+
* Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives
45874599
* in this situation.
45884600
*
45894601
* Notes:
@@ -5976,24 +5988,72 @@ struct mnt_namespace init_mnt_ns = {
59765988

59775989
static void __init init_mount_tree(void)
59785990
{
5979-
struct vfsmount *mnt;
5980-
struct mount *m;
5991+
struct vfsmount *mnt, *nullfs_mnt;
5992+
struct mount *mnt_root;
59815993
struct path root;
59825994

5995+
/*
5996+
* When nullfs is used, we create two mounts:
5997+
*
5998+
* (1) nullfs with mount id 1
5999+
* (2) mutable rootfs with mount id 2
6000+
*
6001+
* with (2) mounted on top of (1).
6002+
*/
6003+
if (nullfs_rootfs) {
6004+
nullfs_mnt = vfs_kern_mount(&nullfs_fs_type, 0, "nullfs", NULL);
6005+
if (IS_ERR(nullfs_mnt))
6006+
panic("VFS: Failed to create nullfs");
6007+
}
6008+
59836009
mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", initramfs_options);
59846010
if (IS_ERR(mnt))
59856011
panic("Can't create rootfs");
59866012

5987-
m = real_mount(mnt);
5988-
init_mnt_ns.root = m;
5989-
init_mnt_ns.nr_mounts = 1;
5990-
mnt_add_to_ns(&init_mnt_ns, m);
6013+
if (nullfs_rootfs) {
6014+
VFS_WARN_ON_ONCE(real_mount(nullfs_mnt)->mnt_id != 1);
6015+
VFS_WARN_ON_ONCE(real_mount(mnt)->mnt_id != 2);
6016+
6017+
/* The namespace root is the nullfs mnt. */
6018+
mnt_root = real_mount(nullfs_mnt);
6019+
init_mnt_ns.root = mnt_root;
6020+
6021+
/* Mount mutable rootfs on top of nullfs. */
6022+
root.mnt = nullfs_mnt;
6023+
root.dentry = nullfs_mnt->mnt_root;
6024+
6025+
LOCK_MOUNT_EXACT(mp, &root);
6026+
if (unlikely(IS_ERR(mp.parent)))
6027+
panic("VFS: Failed to mount rootfs on nullfs");
6028+
scoped_guard(mount_writer)
6029+
attach_mnt(real_mount(mnt), mp.parent, mp.mp);
6030+
6031+
pr_info("VFS: Finished mounting rootfs on nullfs\n");
6032+
} else {
6033+
VFS_WARN_ON_ONCE(real_mount(mnt)->mnt_id != 1);
6034+
6035+
/* The namespace root is the mutable rootfs. */
6036+
mnt_root = real_mount(mnt);
6037+
init_mnt_ns.root = mnt_root;
6038+
}
6039+
6040+
/*
6041+
* We've dropped all locks here but that's fine. Not just are we
6042+
* the only task that's running, there's no other mount
6043+
* namespace in existence and the initial mount namespace is
6044+
* completely empty until we add the mounts we just created.
6045+
*/
6046+
for (struct mount *p = mnt_root; p; p = next_mnt(p, mnt_root)) {
6047+
mnt_add_to_ns(&init_mnt_ns, p);
6048+
init_mnt_ns.nr_mounts++;
6049+
}
6050+
59916051
init_task.nsproxy->mnt_ns = &init_mnt_ns;
59926052
get_mnt_ns(&init_mnt_ns);
59936053

5994-
root.mnt = mnt;
5995-
root.dentry = mnt->mnt_root;
5996-
6054+
/* The root and pwd always point to the mutable rootfs. */
6055+
root.mnt = mnt;
6056+
root.dentry = mnt->mnt_root;
59976057
set_fs_pwd(current->fs, &root);
59986058
set_fs_root(current->fs, &root);
59996059

fs/nullfs.c

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
// SPDX-License-Identifier: GPL-2.0-only
2+
/* Copyright (c) 2026 Christian Brauner <brauner@kernel.org> */
3+
#include <linux/fs/super_types.h>
4+
#include <linux/fs_context.h>
5+
#include <linux/magic.h>
6+
7+
static const struct super_operations nullfs_super_operations = {
8+
.statfs = simple_statfs,
9+
};
10+
11+
static int nullfs_fs_fill_super(struct super_block *s, struct fs_context *fc)
12+
{
13+
struct inode *inode;
14+
15+
s->s_maxbytes = MAX_LFS_FILESIZE;
16+
s->s_blocksize = PAGE_SIZE;
17+
s->s_blocksize_bits = PAGE_SHIFT;
18+
s->s_magic = NULL_FS_MAGIC;
19+
s->s_op = &nullfs_super_operations;
20+
s->s_export_op = NULL;
21+
s->s_xattr = NULL;
22+
s->s_time_gran = 1;
23+
s->s_d_flags = 0;
24+
25+
inode = new_inode(s);
26+
if (!inode)
27+
return -ENOMEM;
28+
29+
/* nullfs is permanently empty... */
30+
make_empty_dir_inode(inode);
31+
simple_inode_init_ts(inode);
32+
inode->i_ino = 1;
33+
/* ... and immutable. */
34+
inode->i_flags |= S_IMMUTABLE;
35+
36+
s->s_root = d_make_root(inode);
37+
if (!s->s_root)
38+
return -ENOMEM;
39+
40+
return 0;
41+
}
42+
43+
/*
44+
* For now this is a single global instance. If needed we can make it
45+
* mountable by userspace at which point we will need to make it
46+
* multi-instance.
47+
*/
48+
static int nullfs_fs_get_tree(struct fs_context *fc)
49+
{
50+
return get_tree_single(fc, nullfs_fs_fill_super);
51+
}
52+
53+
static const struct fs_context_operations nullfs_fs_context_ops = {
54+
.get_tree = nullfs_fs_get_tree,
55+
};
56+
57+
static int nullfs_init_fs_context(struct fs_context *fc)
58+
{
59+
fc->ops = &nullfs_fs_context_ops;
60+
fc->global = true;
61+
fc->sb_flags = SB_NOUSER;
62+
fc->s_iflags = SB_I_NOEXEC | SB_I_NODEV;
63+
return 0;
64+
}
65+
66+
struct file_system_type nullfs_fs_type = {
67+
.name = "nullfs",
68+
.init_fs_context = nullfs_init_fs_context,
69+
.kill_sb = kill_anon_super,
70+
};

include/uapi/linux/magic.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,5 +104,6 @@
104104
#define SECRETMEM_MAGIC 0x5345434d /* "SECM" */
105105
#define PID_FS_MAGIC 0x50494446 /* "PIDF" */
106106
#define GUEST_MEMFD_MAGIC 0x474d454d /* "GMEM" */
107+
#define NULL_FS_MAGIC 0x4E554C4C /* "NULL" */
107108

108109
#endif /* __LINUX_MAGIC_H__ */

init/do_mounts.c

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -492,6 +492,20 @@ void __init prepare_namespace(void)
492492
mount_root(saved_root_name);
493493
out:
494494
devtmpfs_mount();
495+
496+
if (nullfs_rootfs) {
497+
if (init_pivot_root(".", ".")) {
498+
pr_err("VFS: Failed to pivot into new rootfs\n");
499+
return;
500+
}
501+
if (init_umount(".", MNT_DETACH)) {
502+
pr_err("VFS: Failed to unmount old rootfs\n");
503+
return;
504+
}
505+
pr_info("VFS: Pivoted into new rootfs\n");
506+
return;
507+
}
508+
495509
init_mount(".", "/", NULL, MS_MOVE, NULL);
496510
init_chroot(".");
497511
}

init/do_mounts.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
void mount_root_generic(char *name, char *pretty_name, int flags);
1616
void mount_root(char *root_device_name);
1717
extern int root_mountflags;
18+
extern bool nullfs_rootfs;
1819

1920
static inline __init int create_dev(char *name, dev_t dev)
2021
{

0 commit comments

Comments
 (0)