Skip to content

Commit 87caaee

Browse files
mjguzik authored and brauner committed
pidfs: implement ino allocation without the pidmap lock
This paves the way for scalable PID allocation later. The 32 bit variant merely takes a spinlock for simplicity; the 64 bit variant uses a scalable scheme.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://patch.msgid.link/20260120184539.1480930-1-mjguzik@gmail.com
Co-developed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
1 parent 03aef06 commit 87caaee

File tree

2 files changed

+73
-43
lines changed

2 files changed

+73
-43
lines changed

fs/pidfs.c

Lines changed: 72 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include <linux/coredump.h>
2424
#include <linux/rhashtable.h>
2525
#include <linux/xattr.h>
26+
#include <linux/cookie.h>
2627

2728
#include "internal.h"
2829
#include "mount.h"
@@ -65,7 +66,39 @@ static const struct rhashtable_params pidfs_ino_ht_params = {
6566
.automatic_shrinking = true,
6667
};
6768

69+
/*
70+
* inode number handling
71+
*
72+
* On 64 bit nothing special happens. The 64bit number assigned
73+
* to struct pid is the inode number.
74+
*
75+
* On 32 bit the 64 bit number assigned to struct pid is split
76+
* into two 32 bit numbers. The lower 32 bits are used as the
77+
* inode number and the upper 32 bits are used as the inode
78+
* generation number.
79+
*
80+
* On 32 bit pidfs_ino() will return the lower 32 bit. When
81+
* pidfs_ino() returns zero a wrap around happened. When a
82+
* wraparound happens the 64 bit number will be incremented by 1
83+
* so inode numbering starts at 1 again.
84+
*
85+
* On 64 bit comparing two pidfds is as simple as comparing
86+
* inode numbers.
87+
*
88+
* When a wraparound happens on 32 bit multiple pidfds with the
89+
* same inode number are likely to exist (This isn't a problem
90+
* since before pidfs pidfds used the anonymous inode meaning
91+
* all pidfds had the same inode number.). Userspace can
92+
* reconstruct the 64 bit identifier by retrieving both the
93+
* inode number and the inode generation number to compare or
94+
* use file handles.
95+
*/
96+
6897
#if BITS_PER_LONG == 32
98+
99+
DEFINE_SPINLOCK(pidfs_ino_lock);
100+
static u64 pidfs_ino_nr = 1;
101+
69102
static inline unsigned long pidfs_ino(u64 ino)
70103
{
71104
return lower_32_bits(ino);
@@ -77,6 +110,18 @@ static inline u32 pidfs_gen(u64 ino)
77110
return upper_32_bits(ino);
78111
}
79112

113+
static inline u64 pidfs_alloc_ino(void)
114+
{
115+
u64 ino;
116+
117+
spin_lock(&pidfs_ino_lock);
118+
if (pidfs_ino(pidfs_ino_nr) == 0)
119+
pidfs_ino_nr++;
120+
ino = pidfs_ino_nr++;
121+
spin_unlock(&pidfs_ino_lock);
122+
return ino;
123+
}
124+
80125
#else
81126

82127
/* On 64 bit simply return ino. */
@@ -90,61 +135,47 @@ static inline u32 pidfs_gen(u64 ino)
90135
{
91136
return 0;
92137
}
93-
#endif
94138

95-
/*
96-
* Allocate inode number and initialize pidfs fields.
97-
* Called with pidmap_lock held.
98-
*/
99-
void pidfs_prepare_pid(struct pid *pid)
139+
DEFINE_COOKIE(pidfs_ino_cookie);
140+
141+
static u64 pidfs_alloc_ino(void)
100142
{
101-
static u64 pidfs_ino_nr = 2;
143+
u64 ino;
102144

103-
/*
104-
* On 64 bit nothing special happens. The 64bit number assigned
105-
* to struct pid is the inode number.
106-
*
107-
* On 32 bit the 64 bit number assigned to struct pid is split
108-
* into two 32 bit numbers. The lower 32 bits are used as the
109-
* inode number and the upper 32 bits are used as the inode
110-
* generation number.
111-
*
112-
* On 32 bit pidfs_ino() will return the lower 32 bit. When
113-
* pidfs_ino() returns zero a wrap around happened. When a
114-
* wraparound happens the 64 bit number will be incremented by 2
115-
* so inode numbering starts at 2 again.
116-
*
117-
* On 64 bit comparing two pidfds is as simple as comparing
118-
* inode numbers.
119-
*
120-
* When a wraparound happens on 32 bit multiple pidfds with the
121-
* same inode number are likely to exist (This isn't a problem
122-
* since before pidfs pidfds used the anonymous inode meaning
123-
* all pidfds had the same inode number.). Userspace can
124-
* reconstruct the 64 bit identifier by retrieving both the
125-
* inode number and the inode generation number to compare or
126-
* use file handles.
127-
*/
128-
if (pidfs_ino(pidfs_ino_nr) == 0)
129-
pidfs_ino_nr += 2;
145+
preempt_disable();
146+
ino = gen_cookie_next(&pidfs_ino_cookie);
147+
preempt_enable();
148+
149+
VFS_WARN_ON_ONCE(ino < 1);
150+
return ino;
151+
}
152+
153+
#endif
130154

131-
pid->ino = pidfs_ino_nr;
132-
pid->pidfs_hash.next = NULL;
155+
void pidfs_prepare_pid(struct pid *pid)
156+
{
133157
pid->stashed = NULL;
134158
pid->attr = NULL;
135-
pidfs_ino_nr++;
159+
pid->ino = 0;
136160
}
137161

138162
int pidfs_add_pid(struct pid *pid)
139163
{
140-
return rhashtable_insert_fast(&pidfs_ino_ht, &pid->pidfs_hash,
141-
pidfs_ino_ht_params);
164+
int ret;
165+
166+
pid->ino = pidfs_alloc_ino();
167+
ret = rhashtable_insert_fast(&pidfs_ino_ht, &pid->pidfs_hash,
168+
pidfs_ino_ht_params);
169+
if (unlikely(ret))
170+
pid->ino = 0;
171+
return ret;
142172
}
143173

144174
void pidfs_remove_pid(struct pid *pid)
145175
{
146-
rhashtable_remove_fast(&pidfs_ino_ht, &pid->pidfs_hash,
147-
pidfs_ino_ht_params);
176+
if (likely(pid->ino))
177+
rhashtable_remove_fast(&pidfs_ino_ht, &pid->pidfs_hash,
178+
pidfs_ino_ht_params);
148179
}
149180

150181
void pidfs_free_pid(struct pid *pid)

kernel/pid.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
198198
INIT_HLIST_HEAD(&pid->tasks[type]);
199199
init_waitqueue_head(&pid->wait_pidfd);
200200
INIT_HLIST_HEAD(&pid->inodes);
201+
pidfs_prepare_pid(pid);
201202

202203
/*
203204
* 2. perm check checkpoint_restore_ns_capable()
@@ -314,8 +315,6 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
314315
retval = -ENOMEM;
315316
if (unlikely(!(ns->pid_allocated & PIDNS_ADDING)))
316317
goto out_free;
317-
pidfs_prepare_pid(pid);
318-
319318
for (upid = pid->numbers + ns->level; upid >= pid->numbers; --upid) {
320319
/* Make the PID visible to find_pid_ns. */
321320
idr_replace(&upid->ns->idr, pid, upid->nr);

0 commit comments

Comments
 (0)