Skip to content

Commit 961b2ad

Browse files
committed
Merge patch series "further damage-control lack of clone scalability"
Mateusz Guzik <mjguzik@gmail.com> says: When spawning and killing threads in separate processes in parallel the primary bottleneck on the stock kernel is pidmap_lock, largely because of a back-to-back acquire in the common case. Benchmark code at the end. With this patchset alloc_pid() only takes the lock once and consequently alleviates the problem. While scalability improves, the lock remains the primary bottleneck by a large margin. I believe idr is a poor choice for the task at hand to begin with, but sorting that out is beyond the scope of this patchset. At the same time any replacement would be best evaluated against a state where the above relock problem is fixed. Performance improvement varies between reboots. When benchmarking with 20 processes creating and killing threads in a loop, the unpatched baseline hovers around 465k ops/s, while patched is anything between ~510k ops/s and ~560k depending on false-sharing (which I only minimally sanitized). So this is at least 10% if you are unlucky. bench from will-it-scale: char *testcase_description = "Thread creation and teardown"; static void *worker(void *arg) { return (NULL); } void testcase(unsigned long long *iterations, unsigned long nr) { pthread_t thread[1]; int error; while (1) { for (int i = 0; i < 1; i++) { error = pthread_create(&thread[i], NULL, worker, NULL); assert(error == 0); } for (int i = 0; i < 1; i++) { error = pthread_join(thread[i], NULL); assert(error == 0); } (*iterations)++; } } v2: - cosmetic fixes from Oleg - drop idr_preload_many, relock pidmap + call idr_preload again instead - write a commit message for the alloc pid patch * patches from https://patch.msgid.link/20251203092851.287617-1-mjguzik@gmail.com: pid: only take pidmap_lock once on alloc ns: pad refcount Link: https://patch.msgid.link/20251203092851.287617-1-mjguzik@gmail.com Signed-off-by: Christian Brauner <brauner@kernel.org>
2 parents 887e977 + 6d864a1 commit 961b2ad

File tree

2 files changed

+88
-47
lines changed

2 files changed

+88
-47
lines changed

include/linux/ns/ns_common_types.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,11 +108,13 @@ extern const struct proc_ns_operations utsns_operations;
108108
* @ns_tree: namespace tree nodes and active reference count
109109
*/
110110
struct ns_common {
111+
struct {
112+
refcount_t __ns_ref; /* do not use directly */
113+
} ____cacheline_aligned_in_smp;
111114
u32 ns_type;
112115
struct dentry *stashed;
113116
const struct proc_ns_operations *ops;
114117
unsigned int inum;
115-
refcount_t __ns_ref; /* do not use directly */
116118
union {
117119
struct ns_tree;
118120
struct rcu_head ns_rcu;

kernel/pid.c

Lines changed: 85 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -159,58 +159,86 @@ void free_pids(struct pid **pids)
159159
free_pid(pids[tmp]);
160160
}
161161

162-
struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
163-
size_t set_tid_size)
162+
struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
163+
size_t arg_set_tid_size)
164164
{
165+
int set_tid[MAX_PID_NS_LEVEL + 1] = {};
166+
int pid_max[MAX_PID_NS_LEVEL + 1] = {};
165167
struct pid *pid;
166168
enum pid_type type;
167169
int i, nr;
168170
struct pid_namespace *tmp;
169171
struct upid *upid;
170172
int retval = -ENOMEM;
173+
bool retried_preload;
171174

172175
/*
173-
* set_tid_size contains the size of the set_tid array. Starting at
176+
* arg_set_tid_size contains the size of the arg_set_tid array. Starting at
174177
* the most nested currently active PID namespace it tells alloc_pid()
175178
* which PID to set for a process in that most nested PID namespace
176-
* up to set_tid_size PID namespaces. It does not have to set the PID
177-
* for a process in all nested PID namespaces but set_tid_size must
179+
* up to arg_set_tid_size PID namespaces. It does not have to set the PID
180+
* for a process in all nested PID namespaces but arg_set_tid_size must
178181
* never be greater than the current ns->level + 1.
179182
*/
180-
if (set_tid_size > ns->level + 1)
183+
if (arg_set_tid_size > ns->level + 1)
181184
return ERR_PTR(-EINVAL);
182185

186+
/*
187+
* Prep before we take locks:
188+
*
189+
* 1. allocate and fill in pid struct
190+
*/
183191
pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
184192
if (!pid)
185193
return ERR_PTR(retval);
186194

187-
tmp = ns;
195+
get_pid_ns(ns);
188196
pid->level = ns->level;
197+
refcount_set(&pid->count, 1);
198+
spin_lock_init(&pid->lock);
199+
for (type = 0; type < PIDTYPE_MAX; ++type)
200+
INIT_HLIST_HEAD(&pid->tasks[type]);
201+
init_waitqueue_head(&pid->wait_pidfd);
202+
INIT_HLIST_HEAD(&pid->inodes);
189203

190-
for (i = ns->level; i >= 0; i--) {
191-
int tid = 0;
192-
int pid_max = READ_ONCE(tmp->pid_max);
204+
/*
205+
* 2. perm check checkpoint_restore_ns_capable()
206+
*
207+
* This stores found pid_max to make sure the used value is the same should
208+
* later code need it.
209+
*/
210+
for (tmp = ns, i = ns->level; i >= 0; i--) {
211+
pid_max[ns->level - i] = READ_ONCE(tmp->pid_max);
193212

194-
if (set_tid_size) {
195-
tid = set_tid[ns->level - i];
213+
if (arg_set_tid_size) {
214+
int tid = set_tid[ns->level - i] = arg_set_tid[ns->level - i];
196215

197216
retval = -EINVAL;
198-
if (tid < 1 || tid >= pid_max)
199-
goto out_free;
217+
if (tid < 1 || tid >= pid_max[ns->level - i])
218+
goto out_abort;
200219
/*
201220
* Also fail if a PID != 1 is requested and
202221
* no PID 1 exists.
203222
*/
204223
if (tid != 1 && !tmp->child_reaper)
205-
goto out_free;
224+
goto out_abort;
206225
retval = -EPERM;
207226
if (!checkpoint_restore_ns_capable(tmp->user_ns))
208-
goto out_free;
209-
set_tid_size--;
227+
goto out_abort;
228+
arg_set_tid_size--;
210229
}
211230

212-
idr_preload(GFP_KERNEL);
213-
spin_lock(&pidmap_lock);
231+
tmp = tmp->parent;
232+
}
233+
234+
/*
235+
* Prep is done, id allocation goes here:
236+
*/
237+
retried_preload = false;
238+
idr_preload(GFP_KERNEL);
239+
spin_lock(&pidmap_lock);
240+
for (tmp = ns, i = ns->level; i >= 0;) {
241+
int tid = set_tid[ns->level - i];
214242

215243
if (tid) {
216244
nr = idr_alloc(&tmp->idr, NULL, tid,
@@ -220,6 +248,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
220248
* already in use. Return EEXIST in that case.
221249
*/
222250
if (nr == -ENOSPC)
251+
223252
nr = -EEXIST;
224253
} else {
225254
int pid_min = 1;
@@ -235,19 +264,42 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
235264
* a partially initialized PID (see below).
236265
*/
237266
nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
238-
pid_max, GFP_ATOMIC);
267+
pid_max[ns->level - i], GFP_ATOMIC);
268+
if (nr == -ENOSPC)
269+
nr = -EAGAIN;
239270
}
240-
spin_unlock(&pidmap_lock);
241-
idr_preload_end();
242271

243-
if (nr < 0) {
244-
retval = (nr == -ENOSPC) ? -EAGAIN : nr;
272+
if (unlikely(nr < 0)) {
273+
/*
274+
* Preload more memory if idr_alloc{,cyclic} failed with -ENOMEM.
275+
*
276+
* The IDR API only allows us to preload memory for one call, while we may end
277+
* up doing several under pidmap_lock with GFP_ATOMIC. The situation may be
278+
* salvageable with GFP_KERNEL. But make sure to not loop indefinitely if preload
279+
* did not help (the routine unfortunately returns void, so we have no idea
280+
* if it got anywhere).
281+
*
282+
* The lock can be safely dropped and picked up as historically pid allocation
283+
* for different namespaces was *not* atomic -- we try to hold on to it the
284+
* entire time only for performance reasons.
285+
*/
286+
if (nr == -ENOMEM && !retried_preload) {
287+
spin_unlock(&pidmap_lock);
288+
idr_preload_end();
289+
retried_preload = true;
290+
idr_preload(GFP_KERNEL);
291+
spin_lock(&pidmap_lock);
292+
continue;
293+
}
294+
retval = nr;
245295
goto out_free;
246296
}
247297

248298
pid->numbers[i].nr = nr;
249299
pid->numbers[i].ns = tmp;
250300
tmp = tmp->parent;
301+
i--;
302+
retried_preload = false;
251303
}
252304

253305
/*
@@ -257,25 +309,15 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
257309
* is what we have exposed to userspace for a long time and it is
258310
* documented behavior for pid namespaces. So we can't easily
259311
* change it even if there were an error code better suited.
312+
*
313+
* This can't be done earlier because we need to preserve other
314+
* error conditions.
260315
*/
261316
retval = -ENOMEM;
262-
263-
get_pid_ns(ns);
264-
refcount_set(&pid->count, 1);
265-
spin_lock_init(&pid->lock);
266-
for (type = 0; type < PIDTYPE_MAX; ++type)
267-
INIT_HLIST_HEAD(&pid->tasks[type]);
268-
269-
init_waitqueue_head(&pid->wait_pidfd);
270-
INIT_HLIST_HEAD(&pid->inodes);
271-
272-
upid = pid->numbers + ns->level;
273-
idr_preload(GFP_KERNEL);
274-
spin_lock(&pidmap_lock);
275-
if (!(ns->pid_allocated & PIDNS_ADDING))
276-
goto out_unlock;
317+
if (unlikely(!(ns->pid_allocated & PIDNS_ADDING)))
318+
goto out_free;
277319
pidfs_add_pid(pid);
278-
for ( ; upid >= pid->numbers; --upid) {
320+
for (upid = pid->numbers + ns->level; upid >= pid->numbers; --upid) {
279321
/* Make the PID visible to find_pid_ns. */
280322
idr_replace(&upid->ns->idr, pid, upid->nr);
281323
upid->ns->pid_allocated++;
@@ -286,13 +328,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
286328

287329
return pid;
288330

289-
out_unlock:
290-
spin_unlock(&pidmap_lock);
291-
idr_preload_end();
292-
put_pid_ns(ns);
293-
294331
out_free:
295-
spin_lock(&pidmap_lock);
296332
while (++i <= ns->level) {
297333
upid = pid->numbers + i;
298334
idr_remove(&upid->ns->idr, upid->nr);
@@ -303,7 +339,10 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
303339
idr_set_cursor(&ns->idr, 0);
304340

305341
spin_unlock(&pidmap_lock);
342+
idr_preload_end();
306343

344+
out_abort:
345+
put_pid_ns(ns);
307346
kmem_cache_free(ns->pid_cachep, pid);
308347
return ERR_PTR(retval);
309348
}

0 commit comments

Comments
 (0)