Skip to content

Commit 1bce1a6

Browse files
committed
Merge patch series "mount: add OPEN_TREE_NAMESPACE"
Christian Brauner <brauner@kernel.org> says: When creating containers, the setup usually involves using CLONE_NEWNS via clone3() or unshare(). This copies the caller's complete mount namespace. The runtime will also assemble a new rootfs and then use pivot_root() to switch the old mount tree with the new rootfs. Afterward, it will recursively umount the old mount tree, thereby getting rid of all mounts. On a basic system here, where the mount table isn't particularly large, this still copies about 30 mounts. Copying all of these mounts only to get rid of them later is pretty wasteful. This is exacerbated if intermediary mount namespaces are used that only exist for a very short amount of time and are immediately destroyed again, causing a ton of mounts to be copied and destroyed needlessly. With a large mount table and a system where thousands or tens of thousands of namespaces are spawned in parallel, this quickly becomes a bottleneck, increasing contention on the namespace semaphore. Extend open_tree() with a new OPEN_TREE_NAMESPACE flag. Similar to OPEN_TREE_CLONE, only the indicated mount tree is copied. Instead of returning a file descriptor referring to that mount tree, OPEN_TREE_NAMESPACE will cause open_tree() to return a file descriptor to a new mount namespace. In that new mount namespace, the copied mount tree has been mounted on top of a copy of the real rootfs. The caller can setns() into that mount namespace and perform any additional setup, such as move_mount()ing detached mounts in there. This allows OPEN_TREE_NAMESPACE to function as a combined unshare(CLONE_NEWNS) and pivot_root(). A caller may, for example, choose to create an extremely minimal rootfs: fd_mntns = open_tree(-EBADF, "/var/lib/containers/wootwoot", OPEN_TREE_NAMESPACE); This will create a mount namespace where "wootwoot" has become the rootfs mounted on top of the real rootfs. The caller can now setns() into this new mount namespace and assemble additional mounts. 
This also works with user namespaces: unshare(CLONE_NEWUSER); fd_mntns = open_tree(-EBADF, "/var/lib/containers/wootwoot", OPEN_TREE_NAMESPACE); which creates a new mount namespace owned by the earlier created user namespace with "wootwoot" as the rootfs mounted on top of the real rootfs. This will scale a lot better when creating tons of mount namespaces and will make it possible to get rid of a lot of unnecessary mount and umount cycles. It also allows creating mount namespaces without needing to spawn throwaway helper processes. * patches from https://patch.msgid.link/20251229-work-empty-namespace-v1-0-bfb24c7b061f@kernel.org: selftests/open_tree: add OPEN_TREE_NAMESPACE tests mount: add OPEN_TREE_NAMESPACE Link: https://patch.msgid.link/20251229-work-empty-namespace-v1-0-bfb24c7b061f@kernel.org Signed-off-by: Christian Brauner <brauner@kernel.org>
2 parents 51a146e + b8f7622 commit 1bce1a6

File tree

9 files changed

+1231
-17
lines changed

9 files changed

+1231
-17
lines changed

fs/internal.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,7 @@ extern void mnt_pin_kill(struct mount *m);
246246
*/
247247
extern const struct dentry_operations ns_dentry_operations;
248248
int open_namespace(struct ns_common *ns);
249+
struct file *open_namespace_file(struct ns_common *ns);
249250

250251
/*
251252
* fs/stat.c:

fs/namespace.c

Lines changed: 147 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2796,6 +2796,9 @@ static inline void unlock_mount(struct pinned_mountpoint *m)
27962796
__unlock_mount(m);
27972797
}
27982798

2799+
static void lock_mount_exact(const struct path *path,
2800+
struct pinned_mountpoint *mp);
2801+
27992802
#define LOCK_MOUNT_MAYBE_BENEATH(mp, path, beneath) \
28002803
struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
28012804
do_lock_mount((path), &mp, (beneath))
@@ -2946,10 +2949,11 @@ static inline bool may_copy_tree(const struct path *path)
29462949
return check_anonymous_mnt(mnt);
29472950
}
29482951

2949-
2950-
static struct mount *__do_loopback(const struct path *old_path, int recurse)
2952+
static struct mount *__do_loopback(const struct path *old_path,
2953+
unsigned int flags, unsigned int copy_flags)
29512954
{
29522955
struct mount *old = real_mount(old_path->mnt);
2956+
bool recurse = flags & AT_RECURSIVE;
29532957

29542958
if (IS_MNT_UNBINDABLE(old))
29552959
return ERR_PTR(-EINVAL);
@@ -2960,10 +2964,22 @@ static struct mount *__do_loopback(const struct path *old_path, int recurse)
29602964
if (!recurse && __has_locked_children(old, old_path->dentry))
29612965
return ERR_PTR(-EINVAL);
29622966

2967+
/*
2968+
* When creating a new mount namespace we don't want to copy over
2969+
* mounts of mount namespaces to avoid the risk of cycles and also to
2970+
* minimize the default complex interdependencies between mount
2971+
* namespaces.
2972+
*
2973+
* We could of course just check whether all mount namespace files aren't
2974+
* creating cycles but really let's keep this simple.
2975+
*/
2976+
if (!(flags & OPEN_TREE_NAMESPACE))
2977+
copy_flags |= CL_COPY_MNT_NS_FILE;
2978+
29632979
if (recurse)
2964-
return copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
2965-
else
2966-
return clone_mnt(old, old_path->dentry, 0);
2980+
return copy_tree(old, old_path->dentry, copy_flags);
2981+
2982+
return clone_mnt(old, old_path->dentry, copy_flags);
29672983
}
29682984

29692985
/*
@@ -2974,7 +2990,9 @@ static int do_loopback(const struct path *path, const char *old_name,
29742990
{
29752991
struct path old_path __free(path_put) = {};
29762992
struct mount *mnt = NULL;
2993+
unsigned int flags = recurse ? AT_RECURSIVE : 0;
29772994
int err;
2995+
29782996
if (!old_name || !*old_name)
29792997
return -EINVAL;
29802998
err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
@@ -2991,7 +3009,7 @@ static int do_loopback(const struct path *path, const char *old_name,
29913009
if (!check_mnt(mp.parent))
29923010
return -EINVAL;
29933011

2994-
mnt = __do_loopback(&old_path, recurse);
3012+
mnt = __do_loopback(&old_path, flags, 0);
29953013
if (IS_ERR(mnt))
29963014
return PTR_ERR(mnt);
29973015

@@ -3004,7 +3022,7 @@ static int do_loopback(const struct path *path, const char *old_name,
30043022
return err;
30053023
}
30063024

3007-
static struct mnt_namespace *get_detached_copy(const struct path *path, bool recursive)
3025+
static struct mnt_namespace *get_detached_copy(const struct path *path, unsigned int flags)
30083026
{
30093027
struct mnt_namespace *ns, *mnt_ns = current->nsproxy->mnt_ns, *src_mnt_ns;
30103028
struct user_namespace *user_ns = mnt_ns->user_ns;
@@ -3029,7 +3047,7 @@ static struct mnt_namespace *get_detached_copy(const struct path *path, bool rec
30293047
ns->seq_origin = src_mnt_ns->ns.ns_id;
30303048
}
30313049

3032-
mnt = __do_loopback(path, recursive);
3050+
mnt = __do_loopback(path, flags, 0);
30333051
if (IS_ERR(mnt)) {
30343052
emptied_ns = ns;
30353053
return ERR_CAST(mnt);
@@ -3043,9 +3061,9 @@ static struct mnt_namespace *get_detached_copy(const struct path *path, bool rec
30433061
return ns;
30443062
}
30453063

3046-
static struct file *open_detached_copy(struct path *path, bool recursive)
3064+
static struct file *open_detached_copy(struct path *path, unsigned int flags)
30473065
{
3048-
struct mnt_namespace *ns = get_detached_copy(path, recursive);
3066+
struct mnt_namespace *ns = get_detached_copy(path, flags);
30493067
struct file *file;
30503068

30513069
if (IS_ERR(ns))
@@ -3061,21 +3079,122 @@ static struct file *open_detached_copy(struct path *path, bool recursive)
30613079
return file;
30623080
}
30633081

3082+
DEFINE_FREE(put_empty_mnt_ns, struct mnt_namespace *,
3083+
if (!IS_ERR_OR_NULL(_T)) free_mnt_ns(_T))
3084+
3085+
static struct mnt_namespace *create_new_namespace(struct path *path, unsigned int flags)
3086+
{
3087+
struct mnt_namespace *new_ns __free(put_empty_mnt_ns) = NULL;
3088+
struct path to_path __free(path_put) = {};
3089+
struct mnt_namespace *ns = current->nsproxy->mnt_ns;
3090+
struct user_namespace *user_ns = current_user_ns();
3091+
struct mount *new_ns_root;
3092+
struct mount *mnt;
3093+
unsigned int copy_flags = 0;
3094+
bool locked = false;
3095+
3096+
if (user_ns != ns->user_ns)
3097+
copy_flags |= CL_SLAVE;
3098+
3099+
new_ns = alloc_mnt_ns(user_ns, false);
3100+
if (IS_ERR(new_ns))
3101+
return ERR_CAST(new_ns);
3102+
3103+
scoped_guard(namespace_excl) {
3104+
new_ns_root = clone_mnt(ns->root, ns->root->mnt.mnt_root, copy_flags);
3105+
if (IS_ERR(new_ns_root))
3106+
return ERR_CAST(new_ns_root);
3107+
3108+
/*
3109+
* If the real rootfs had a locked mount on top of it somewhere
3110+
* in the stack, lock the new mount tree as well so it can't be
3111+
* exposed.
3112+
*/
3113+
mnt = ns->root;
3114+
while (mnt->overmount) {
3115+
mnt = mnt->overmount;
3116+
if (mnt->mnt.mnt_flags & MNT_LOCKED)
3117+
locked = true;
3118+
}
3119+
}
3120+
3121+
/*
3122+
* We dropped the namespace semaphore so we can actually lock
3123+
* the copy for mounting. The copied mount isn't attached to any
3124+
* mount namespace and it is thus excluded from any propagation.
3125+
* So realistically we're isolated and the mount can't be
3126+
* overmounted.
3127+
*/
3128+
3129+
/* Borrow the reference from clone_mnt(). */
3130+
to_path.mnt = &new_ns_root->mnt;
3131+
to_path.dentry = dget(new_ns_root->mnt.mnt_root);
3132+
3133+
/* Now lock for actual mounting. */
3134+
LOCK_MOUNT_EXACT(mp, &to_path);
3135+
if (unlikely(IS_ERR(mp.parent)))
3136+
return ERR_CAST(mp.parent);
3137+
3138+
/*
3139+
* We don't emulate unshare()ing a mount namespace. We stick to the
3140+
* restrictions of creating detached bind-mounts. It has a lot
3141+
* saner and simpler semantics.
3142+
*/
3143+
mnt = __do_loopback(path, flags, copy_flags);
3144+
if (IS_ERR(mnt))
3145+
return ERR_CAST(mnt);
3146+
3147+
scoped_guard(mount_writer) {
3148+
if (locked)
3149+
mnt->mnt.mnt_flags |= MNT_LOCKED;
3150+
/*
3151+
* Now mount the detached tree on top of the copy of the
3152+
* real rootfs we created.
3153+
*/
3154+
attach_mnt(mnt, new_ns_root, mp.mp);
3155+
if (user_ns != ns->user_ns)
3156+
lock_mnt_tree(new_ns_root);
3157+
}
3158+
3159+
/* Add all mounts to the new namespace. */
3160+
for (struct mount *p = new_ns_root; p; p = next_mnt(p, new_ns_root)) {
3161+
mnt_add_to_ns(new_ns, p);
3162+
new_ns->nr_mounts++;
3163+
}
3164+
3165+
new_ns->root = real_mount(no_free_ptr(to_path.mnt));
3166+
ns_tree_add_raw(new_ns);
3167+
return no_free_ptr(new_ns);
3168+
}
3169+
3170+
static struct file *open_new_namespace(struct path *path, unsigned int flags)
3171+
{
3172+
struct mnt_namespace *new_ns;
3173+
3174+
new_ns = create_new_namespace(path, flags);
3175+
if (IS_ERR(new_ns))
3176+
return ERR_CAST(new_ns);
3177+
return open_namespace_file(to_ns_common(new_ns));
3178+
}
3179+
30643180
static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned int flags)
30653181
{
30663182
int ret;
30673183
struct path path __free(path_put) = {};
30683184
int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
3069-
bool detached = flags & OPEN_TREE_CLONE;
30703185

30713186
BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
30723187

30733188
if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE |
30743189
AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE |
3075-
OPEN_TREE_CLOEXEC))
3190+
OPEN_TREE_CLOEXEC | OPEN_TREE_NAMESPACE))
30763191
return ERR_PTR(-EINVAL);
30773192

3078-
if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE)
3193+
if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE | OPEN_TREE_NAMESPACE)) ==
3194+
AT_RECURSIVE)
3195+
return ERR_PTR(-EINVAL);
3196+
3197+
if (hweight32(flags & (OPEN_TREE_CLONE | OPEN_TREE_NAMESPACE)) > 1)
30793198
return ERR_PTR(-EINVAL);
30803199

30813200
if (flags & AT_NO_AUTOMOUNT)
@@ -3085,15 +3204,27 @@ static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned
30853204
if (flags & AT_EMPTY_PATH)
30863205
lookup_flags |= LOOKUP_EMPTY;
30873206

3088-
if (detached && !may_mount())
3207+
/*
3208+
* If we create a new mount namespace with the cloned mount tree we
3209+
* just care about being privileged over our current user namespace.
3210+
* The new mount namespace will be owned by it.
3211+
*/
3212+
if ((flags & OPEN_TREE_NAMESPACE) &&
3213+
!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
3214+
return ERR_PTR(-EPERM);
3215+
3216+
if ((flags & OPEN_TREE_CLONE) && !may_mount())
30893217
return ERR_PTR(-EPERM);
30903218

30913219
ret = user_path_at(dfd, filename, lookup_flags, &path);
30923220
if (unlikely(ret))
30933221
return ERR_PTR(ret);
30943222

3095-
if (detached)
3096-
return open_detached_copy(&path, flags & AT_RECURSIVE);
3223+
if (flags & OPEN_TREE_NAMESPACE)
3224+
return open_new_namespace(&path, flags);
3225+
3226+
if (flags & OPEN_TREE_CLONE)
3227+
return open_detached_copy(&path, flags);
30973228

30983229
return dentry_open(&path, O_PATH, current_cred());
30993230
}

fs/nsfs.c

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,19 @@ int ns_get_path(struct path *path, struct task_struct *task,
9999
return ns_get_path_cb(path, ns_get_path_task, &args);
100100
}
101101

102+
struct file *open_namespace_file(struct ns_common *ns)
103+
{
104+
struct path path __free(path_put) = {};
105+
int err;
106+
107+
/* call first to consume reference */
108+
err = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path);
109+
if (err < 0)
110+
return ERR_PTR(err);
111+
112+
return dentry_open(&path, O_RDONLY, current_cred());
113+
}
114+
102115
/**
103116
* open_namespace - open a namespace
104117
* @ns: the namespace to open

include/uapi/linux/mount.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,8 @@
6161
/*
6262
* open_tree() flags.
6363
*/
64-
#define OPEN_TREE_CLONE 1 /* Clone the target tree and attach the clone */
64+
#define OPEN_TREE_CLONE (1 << 0) /* Clone the target tree and attach the clone */
65+
#define OPEN_TREE_NAMESPACE (1 << 1) /* Clone the target tree into a new mount namespace */
6566
#define OPEN_TREE_CLOEXEC O_CLOEXEC /* Close the file on execve() */
6667

6768
/*
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
open_tree_ns_test
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# SPDX-License-Identifier: GPL-2.0
2+
TEST_GEN_PROGS := open_tree_ns_test
3+
4+
CFLAGS := -Wall -Werror -g $(KHDR_INCLUDES)
5+
LDLIBS := -lcap
6+
7+
include ../../lib.mk
8+
9+
$(OUTPUT)/open_tree_ns_test: open_tree_ns_test.c ../utils.c
10+
$(CC) $(CFLAGS) -o $@ $^ $(LDLIBS)

0 commit comments

Comments
 (0)