Skip to content

Commit 6d864a1

Browse files
mjguzik authored and brauner committed
pid: only take pidmap_lock once on alloc
When spawning and killing threads in separate processes in parallel the primary bottleneck on the stock kernel is pidmap_lock, largely because of a back-to-back acquire in the common case. This aspect is fixed with the patch. Performance improvement varies between reboots. When benchmarking with 20 processes creating and killing threads in a loop, the unpatched baseline hovers around 465k ops/s, while patched is anything between ~510k ops/s and ~560k depending on false-sharing (which I only minimally sanitized). So this is at least 10% if you are unlucky. The change also facilitated some cosmetic fixes. It has an unintentional side effect of no longer issuing spurious idr_preload() around idr_replace(). Signed-off-by: Mateusz Guzik <mjguzik@gmail.com> Link: https://patch.msgid.link/20251203092851.287617-3-mjguzik@gmail.com Reviewed-by: Oleg Nesterov <oleg@redhat.com> Signed-off-by: Christian Brauner <brauner@kernel.org>
1 parent c0aac59 commit 6d864a1

File tree

1 file changed

+85
-46
lines changed

1 file changed

+85
-46
lines changed

kernel/pid.c

Lines changed: 85 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -159,58 +159,86 @@ void free_pids(struct pid **pids)
159159
free_pid(pids[tmp]);
160160
}
161161

162-
struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
163-
size_t set_tid_size)
162+
struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
163+
size_t arg_set_tid_size)
164164
{
165+
int set_tid[MAX_PID_NS_LEVEL + 1] = {};
166+
int pid_max[MAX_PID_NS_LEVEL + 1] = {};
165167
struct pid *pid;
166168
enum pid_type type;
167169
int i, nr;
168170
struct pid_namespace *tmp;
169171
struct upid *upid;
170172
int retval = -ENOMEM;
173+
bool retried_preload;
171174

172175
/*
173-
* set_tid_size contains the size of the set_tid array. Starting at
176+
* arg_set_tid_size contains the size of the arg_set_tid array. Starting at
174177
* the most nested currently active PID namespace it tells alloc_pid()
175178
* which PID to set for a process in that most nested PID namespace
176-
* up to set_tid_size PID namespaces. It does not have to set the PID
177-
* for a process in all nested PID namespaces but set_tid_size must
179+
* up to arg_set_tid_size PID namespaces. It does not have to set the PID
180+
* for a process in all nested PID namespaces but arg_set_tid_size must
178181
* never be greater than the current ns->level + 1.
179182
*/
180-
if (set_tid_size > ns->level + 1)
183+
if (arg_set_tid_size > ns->level + 1)
181184
return ERR_PTR(-EINVAL);
182185

186+
/*
187+
* Prep before we take locks:
188+
*
189+
* 1. allocate and fill in pid struct
190+
*/
183191
pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
184192
if (!pid)
185193
return ERR_PTR(retval);
186194

187-
tmp = ns;
195+
get_pid_ns(ns);
188196
pid->level = ns->level;
197+
refcount_set(&pid->count, 1);
198+
spin_lock_init(&pid->lock);
199+
for (type = 0; type < PIDTYPE_MAX; ++type)
200+
INIT_HLIST_HEAD(&pid->tasks[type]);
201+
init_waitqueue_head(&pid->wait_pidfd);
202+
INIT_HLIST_HEAD(&pid->inodes);
189203

190-
for (i = ns->level; i >= 0; i--) {
191-
int tid = 0;
192-
int pid_max = READ_ONCE(tmp->pid_max);
204+
/*
205+
* 2. perm check checkpoint_restore_ns_capable()
206+
*
207+
* This stores found pid_max to make sure the used value is the same should
208+
* later code need it.
209+
*/
210+
for (tmp = ns, i = ns->level; i >= 0; i--) {
211+
pid_max[ns->level - i] = READ_ONCE(tmp->pid_max);
193212

194-
if (set_tid_size) {
195-
tid = set_tid[ns->level - i];
213+
if (arg_set_tid_size) {
214+
int tid = set_tid[ns->level - i] = arg_set_tid[ns->level - i];
196215

197216
retval = -EINVAL;
198-
if (tid < 1 || tid >= pid_max)
199-
goto out_free;
217+
if (tid < 1 || tid >= pid_max[ns->level - i])
218+
goto out_abort;
200219
/*
201220
* Also fail if a PID != 1 is requested and
202221
* no PID 1 exists.
203222
*/
204223
if (tid != 1 && !tmp->child_reaper)
205-
goto out_free;
224+
goto out_abort;
206225
retval = -EPERM;
207226
if (!checkpoint_restore_ns_capable(tmp->user_ns))
208-
goto out_free;
209-
set_tid_size--;
227+
goto out_abort;
228+
arg_set_tid_size--;
210229
}
211230

212-
idr_preload(GFP_KERNEL);
213-
spin_lock(&pidmap_lock);
231+
tmp = tmp->parent;
232+
}
233+
234+
/*
235+
* Prep is done, id allocation goes here:
236+
*/
237+
retried_preload = false;
238+
idr_preload(GFP_KERNEL);
239+
spin_lock(&pidmap_lock);
240+
for (tmp = ns, i = ns->level; i >= 0;) {
241+
int tid = set_tid[ns->level - i];
214242

215243
if (tid) {
216244
nr = idr_alloc(&tmp->idr, NULL, tid,
@@ -220,6 +248,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
220248
* alreay in use. Return EEXIST in that case.
221249
*/
222250
if (nr == -ENOSPC)
251+
223252
nr = -EEXIST;
224253
} else {
225254
int pid_min = 1;
@@ -235,19 +264,42 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
235264
* a partially initialized PID (see below).
236265
*/
237266
nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
238-
pid_max, GFP_ATOMIC);
267+
pid_max[ns->level - i], GFP_ATOMIC);
268+
if (nr == -ENOSPC)
269+
nr = -EAGAIN;
239270
}
240-
spin_unlock(&pidmap_lock);
241-
idr_preload_end();
242271

243-
if (nr < 0) {
244-
retval = (nr == -ENOSPC) ? -EAGAIN : nr;
272+
if (unlikely(nr < 0)) {
273+
/*
274+
* Preload more memory if idr_alloc{,cyclic} failed with -ENOMEM.
275+
*
276+
* The IDR API only allows us to preload memory for one call, while we may end
277+
* up doing several under pidmap_lock with GFP_ATOMIC. The situation may be
278+
* salvageable with GFP_KERNEL. But make sure to not loop indefinitely if preload
279+
* did not help (the routine unfortunately returns void, so we have no idea
280+
* if it got anywhere).
281+
*
282+
* The lock can be safely dropped and picked up as historically pid allocation
283+
* for different namespaces was *not* atomic -- we try to hold on to it the
284+
* entire time only for performance reasons.
285+
*/
286+
if (nr == -ENOMEM && !retried_preload) {
287+
spin_unlock(&pidmap_lock);
288+
idr_preload_end();
289+
retried_preload = true;
290+
idr_preload(GFP_KERNEL);
291+
spin_lock(&pidmap_lock);
292+
continue;
293+
}
294+
retval = nr;
245295
goto out_free;
246296
}
247297

248298
pid->numbers[i].nr = nr;
249299
pid->numbers[i].ns = tmp;
250300
tmp = tmp->parent;
301+
i--;
302+
retried_preload = false;
251303
}
252304

253305
/*
@@ -257,25 +309,15 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
257309
* is what we have exposed to userspace for a long time and it is
258310
* documented behavior for pid namespaces. So we can't easily
259311
* change it even if there were an error code better suited.
312+
*
313+
* This can't be done earlier because we need to preserve other
314+
* error conditions.
260315
*/
261316
retval = -ENOMEM;
262-
263-
get_pid_ns(ns);
264-
refcount_set(&pid->count, 1);
265-
spin_lock_init(&pid->lock);
266-
for (type = 0; type < PIDTYPE_MAX; ++type)
267-
INIT_HLIST_HEAD(&pid->tasks[type]);
268-
269-
init_waitqueue_head(&pid->wait_pidfd);
270-
INIT_HLIST_HEAD(&pid->inodes);
271-
272-
upid = pid->numbers + ns->level;
273-
idr_preload(GFP_KERNEL);
274-
spin_lock(&pidmap_lock);
275-
if (!(ns->pid_allocated & PIDNS_ADDING))
276-
goto out_unlock;
317+
if (unlikely(!(ns->pid_allocated & PIDNS_ADDING)))
318+
goto out_free;
277319
pidfs_add_pid(pid);
278-
for ( ; upid >= pid->numbers; --upid) {
320+
for (upid = pid->numbers + ns->level; upid >= pid->numbers; --upid) {
279321
/* Make the PID visible to find_pid_ns. */
280322
idr_replace(&upid->ns->idr, pid, upid->nr);
281323
upid->ns->pid_allocated++;
@@ -286,13 +328,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
286328

287329
return pid;
288330

289-
out_unlock:
290-
spin_unlock(&pidmap_lock);
291-
idr_preload_end();
292-
put_pid_ns(ns);
293-
294331
out_free:
295-
spin_lock(&pidmap_lock);
296332
while (++i <= ns->level) {
297333
upid = pid->numbers + i;
298334
idr_remove(&upid->ns->idr, upid->nr);
@@ -303,7 +339,10 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
303339
idr_set_cursor(&ns->idr, 0);
304340

305341
spin_unlock(&pidmap_lock);
342+
idr_preload_end();
306343

344+
out_abort:
345+
put_pid_ns(ns);
307346
kmem_cache_free(ns->pid_cachep, pid);
308347
return ERR_PTR(retval);
309348
}

0 commit comments

Comments
 (0)