Skip to content

Commit a41dbf5

Browse files
committed
mount: hold namespace_sem across copy in create_new_namespace()
Fix an oversight when creating a new mount namespace. If someone had the bright idea to make the real rootfs a shared or dependent mount and it is later copied the copy will become a peer of the old real rootfs mount or a dependent mount of it. The namespace semaphore is dropped and we use mount lock exact to lock the new real root mount. If that fails or the subsequent do_loopback() fails we rely on the copy of the real root mount to be cleaned up by path_put(). The problem is that this doesn't deal with mount propagation and will leave the mounts linked in the propagation lists. When creating a new mount namespace create_new_namespace() first acquires namespace_sem to clone the nullfs root, drops it, then reacquires it via LOCK_MOUNT_EXACT which takes inode_lock first to respect the inode_lock -> namespace_sem lock ordering. This drop-and-reacquire pattern is fragile and was the source of the propagation cleanup bug fixed in the preceding commit. Extend lock_mount_exact() with a copy_mount mode that clones the mount under the locks atomically. When copy_mount is true, path_overmounted() is skipped since we're copying the mount, not mounting on top of it - the nullfs root always has rootfs mounted on top so the check would always fail. If clone_mnt() fails after get_mountpoint() has pinned the mountpoint, __unlock_mount() is used to properly unpin the mountpoint and release both locks. This allows create_new_namespace() to use LOCK_MOUNT_EXACT_COPY which takes inode_lock and namespace_sem once and holds them throughout the clone and subsequent mount operations, eliminating the drop-and-reacquire pattern entirely. Reported-by: syzbot+a89f9434fb5a001ccd58@syzkaller.appspotmail.com Fixes: 9b8a0ba ("mount: add OPEN_TREE_NAMESPACE") # mainline only Link: https://lore.kernel.org/699047f6.050a0220.2757fb.0024.GAE@google.com Signed-off-by: Christian Brauner <brauner@kernel.org>
1 parent ac83896 commit a41dbf5

File tree

1 file changed

+57
-54
lines changed

1 file changed

+57
-54
lines changed

fs/namespace.c

Lines changed: 57 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -2791,15 +2791,19 @@ static inline void unlock_mount(struct pinned_mountpoint *m)
27912791
}
27922792

27932793
static void lock_mount_exact(const struct path *path,
2794-
struct pinned_mountpoint *mp);
2794+
struct pinned_mountpoint *mp, bool copy_mount,
2795+
unsigned int copy_flags);
27952796

27962797
#define LOCK_MOUNT_MAYBE_BENEATH(mp, path, beneath) \
27972798
struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
27982799
do_lock_mount((path), &mp, (beneath))
27992800
#define LOCK_MOUNT(mp, path) LOCK_MOUNT_MAYBE_BENEATH(mp, (path), false)
28002801
#define LOCK_MOUNT_EXACT(mp, path) \
28012802
struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
2802-
lock_mount_exact((path), &mp)
2803+
lock_mount_exact((path), &mp, false, 0)
2804+
#define LOCK_MOUNT_EXACT_COPY(mp, path, copy_flags) \
2805+
struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
2806+
lock_mount_exact((path), &mp, true, (copy_flags))
28032807

28042808
static int graft_tree(struct mount *mnt, const struct pinned_mountpoint *mp)
28052809
{
@@ -3073,16 +3077,13 @@ static struct file *open_detached_copy(struct path *path, unsigned int flags)
30733077
return file;
30743078
}
30753079

3076-
DEFINE_FREE(put_empty_mnt_ns, struct mnt_namespace *,
3077-
if (!IS_ERR_OR_NULL(_T)) free_mnt_ns(_T))
3078-
30793080
static struct mnt_namespace *create_new_namespace(struct path *path, unsigned int flags)
30803081
{
3081-
struct mnt_namespace *new_ns __free(put_empty_mnt_ns) = NULL;
3082-
struct path to_path __free(path_put) = {};
30833082
struct mnt_namespace *ns = current->nsproxy->mnt_ns;
30843083
struct user_namespace *user_ns = current_user_ns();
3085-
struct mount *new_ns_root;
3084+
struct mnt_namespace *new_ns;
3085+
struct mount *new_ns_root, *old_ns_root;
3086+
struct path to_path;
30863087
struct mount *mnt;
30873088
unsigned int copy_flags = 0;
30883089
bool locked = false;
@@ -3094,71 +3095,63 @@ static struct mnt_namespace *create_new_namespace(struct path *path, unsigned in
30943095
if (IS_ERR(new_ns))
30953096
return ERR_CAST(new_ns);
30963097

3097-
scoped_guard(namespace_excl) {
3098-
new_ns_root = clone_mnt(ns->root, ns->root->mnt.mnt_root, copy_flags);
3099-
if (IS_ERR(new_ns_root))
3100-
return ERR_CAST(new_ns_root);
3098+
old_ns_root = ns->root;
3099+
to_path.mnt = &old_ns_root->mnt;
3100+
to_path.dentry = old_ns_root->mnt.mnt_root;
31013101

3102-
/*
3103-
* If the real rootfs had a locked mount on top of it somewhere
3104-
* in the stack, lock the new mount tree as well so it can't be
3105-
* exposed.
3106-
*/
3107-
mnt = ns->root;
3108-
while (mnt->overmount) {
3109-
mnt = mnt->overmount;
3110-
if (mnt->mnt.mnt_flags & MNT_LOCKED)
3111-
locked = true;
3112-
}
3102+
VFS_WARN_ON_ONCE(old_ns_root->mnt.mnt_sb->s_type != &nullfs_fs_type);
3103+
3104+
LOCK_MOUNT_EXACT_COPY(mp, &to_path, copy_flags);
3105+
if (IS_ERR(mp.parent)) {
3106+
free_mnt_ns(new_ns);
3107+
return ERR_CAST(mp.parent);
31133108
}
3109+
new_ns_root = mp.parent;
31143110

31153111
/*
3116-
* We dropped the namespace semaphore so we can actually lock
3117-
* the copy for mounting. The copied mount isn't attached to any
3118-
* mount namespace and it is thus excluded from any propagation.
3119-
* So realistically we're isolated and the mount can't be
3120-
* overmounted.
3112+
* If the real rootfs had a locked mount on top of it somewhere
3113+
* in the stack, lock the new mount tree as well so it can't be
3114+
* exposed.
31213115
*/
3122-
3123-
/* Borrow the reference from clone_mnt(). */
3124-
to_path.mnt = &new_ns_root->mnt;
3125-
to_path.dentry = dget(new_ns_root->mnt.mnt_root);
3126-
3127-
/* Now lock for actual mounting. */
3128-
LOCK_MOUNT_EXACT(mp, &to_path);
3129-
if (unlikely(IS_ERR(mp.parent)))
3130-
return ERR_CAST(mp.parent);
3116+
mnt = old_ns_root;
3117+
while (mnt->overmount) {
3118+
mnt = mnt->overmount;
3119+
if (mnt->mnt.mnt_flags & MNT_LOCKED)
3120+
locked = true;
3121+
}
31313122

31323123
/*
3133-
* We don't emulate unshare()ing a mount namespace. We stick to the
3134-
* restrictions of creating detached bind-mounts. It has a lot
3135-
* saner and simpler semantics.
3124+
* We don't emulate unshare()ing a mount namespace. We stick
3125+
* to the restrictions of creating detached bind-mounts. It
3126+
* has a lot saner and simpler semantics.
31363127
*/
31373128
mnt = __do_loopback(path, flags, copy_flags);
3138-
if (IS_ERR(mnt))
3139-
return ERR_CAST(mnt);
3140-
31413129
scoped_guard(mount_writer) {
3130+
if (IS_ERR(mnt)) {
3131+
emptied_ns = new_ns;
3132+
umount_tree(new_ns_root, 0);
3133+
return ERR_CAST(mnt);
3134+
}
3135+
31423136
if (locked)
31433137
mnt->mnt.mnt_flags |= MNT_LOCKED;
31443138
/*
3145-
* Now mount the detached tree on top of the copy of the
3146-
* real rootfs we created.
3139+
* now mount the detached tree on top of the copy
3140+
* of the real rootfs we created.
31473141
*/
31483142
attach_mnt(mnt, new_ns_root, mp.mp);
31493143
if (user_ns != ns->user_ns)
31503144
lock_mnt_tree(new_ns_root);
31513145
}
31523146

3153-
/* Add all mounts to the new namespace. */
3154-
for (struct mount *p = new_ns_root; p; p = next_mnt(p, new_ns_root)) {
3155-
mnt_add_to_ns(new_ns, p);
3147+
for (mnt = new_ns_root; mnt; mnt = next_mnt(mnt, new_ns_root)) {
3148+
mnt_add_to_ns(new_ns, mnt);
31563149
new_ns->nr_mounts++;
31573150
}
31583151

3159-
new_ns->root = real_mount(no_free_ptr(to_path.mnt));
3152+
new_ns->root = new_ns_root;
31603153
ns_tree_add_raw(new_ns);
3161-
return no_free_ptr(new_ns);
3154+
return new_ns;
31623155
}
31633156

31643157
static struct file *open_new_namespace(struct path *path, unsigned int flags)
@@ -3840,26 +3833,36 @@ static int do_new_mount(const struct path *path, const char *fstype,
38403833
}
38413834

38423835
static void lock_mount_exact(const struct path *path,
3843-
struct pinned_mountpoint *mp)
3836+
struct pinned_mountpoint *mp, bool copy_mount,
3837+
unsigned int copy_flags)
38443838
{
38453839
struct dentry *dentry = path->dentry;
38463840
int err;
38473841

3842+
/* Assert that inode_lock() locked the correct inode. */
3843+
VFS_WARN_ON_ONCE(copy_mount && !path_mounted(path));
3844+
38483845
inode_lock(dentry->d_inode);
38493846
namespace_lock();
38503847
if (unlikely(cant_mount(dentry)))
38513848
err = -ENOENT;
3852-
else if (path_overmounted(path))
3849+
else if (!copy_mount && path_overmounted(path))
38533850
err = -EBUSY;
38543851
else
38553852
err = get_mountpoint(dentry, mp);
38563853
if (unlikely(err)) {
38573854
namespace_unlock();
38583855
inode_unlock(dentry->d_inode);
38593856
mp->parent = ERR_PTR(err);
3860-
} else {
3861-
mp->parent = real_mount(path->mnt);
3857+
return;
38623858
}
3859+
3860+
if (copy_mount)
3861+
mp->parent = clone_mnt(real_mount(path->mnt), dentry, copy_flags);
3862+
else
3863+
mp->parent = real_mount(path->mnt);
3864+
if (unlikely(IS_ERR(mp->parent)))
3865+
__unlock_mount(mp);
38633866
}
38643867

38653868
int finish_automount(struct vfsmount *__m, const struct path *path)

0 commit comments

Comments
 (0)