@@ -159,58 +159,86 @@ void free_pids(struct pid **pids)
159159 free_pid (pids [tmp ]);
160160}
161161
162- struct pid * alloc_pid (struct pid_namespace * ns , pid_t * set_tid ,
163- size_t set_tid_size )
162+ struct pid * alloc_pid (struct pid_namespace * ns , pid_t * arg_set_tid ,
163+ size_t arg_set_tid_size )
164164{
165+ int set_tid [MAX_PID_NS_LEVEL + 1 ] = {};
166+ int pid_max [MAX_PID_NS_LEVEL + 1 ] = {};
165167 struct pid * pid ;
166168 enum pid_type type ;
167169 int i , nr ;
168170 struct pid_namespace * tmp ;
169171 struct upid * upid ;
170172 int retval = - ENOMEM ;
173+ bool retried_preload ;
171174
172175 /*
173- * set_tid_size contains the size of the set_tid array. Starting at
176+ * arg_set_tid_size contains the size of the arg_set_tid array. Starting at
174177 * the most nested currently active PID namespace it tells alloc_pid()
175178 * which PID to set for a process in that most nested PID namespace
176- * up to set_tid_size PID namespaces. It does not have to set the PID
177- * for a process in all nested PID namespaces but set_tid_size must
179+ * up to arg_set_tid_size PID namespaces. It does not have to set the PID
180+ * for a process in all nested PID namespaces but arg_set_tid_size must
178181 * never be greater than the current ns->level + 1.
179182 */
180- if (set_tid_size > ns -> level + 1 )
183+ if (arg_set_tid_size > ns -> level + 1 )
181184 return ERR_PTR (- EINVAL );
182185
186+ /*
187+ * Prep before we take locks:
188+ *
189+ * 1. allocate and fill in pid struct
190+ */
183191 pid = kmem_cache_alloc (ns -> pid_cachep , GFP_KERNEL );
184192 if (!pid )
185193 return ERR_PTR (retval );
186194
187- tmp = ns ;
195+ get_pid_ns ( ns ) ;
188196 pid -> level = ns -> level ;
197+ refcount_set (& pid -> count , 1 );
198+ spin_lock_init (& pid -> lock );
199+ for (type = 0 ; type < PIDTYPE_MAX ; ++ type )
200+ INIT_HLIST_HEAD (& pid -> tasks [type ]);
201+ init_waitqueue_head (& pid -> wait_pidfd );
202+ INIT_HLIST_HEAD (& pid -> inodes );
189203
190- for (i = ns -> level ; i >= 0 ; i -- ) {
191- int tid = 0 ;
192- int pid_max = READ_ONCE (tmp -> pid_max );
204+ /*
205+ * 2. perm check checkpoint_restore_ns_capable()
206+ *
207+ * This stores found pid_max to make sure the used value is the same should
208+ * later code need it.
209+ */
210+ for (tmp = ns , i = ns -> level ; i >= 0 ; i -- ) {
211+ pid_max [ns -> level - i ] = READ_ONCE (tmp -> pid_max );
193212
194- if (set_tid_size ) {
195- tid = set_tid [ns -> level - i ];
213+ if (arg_set_tid_size ) {
214+ int tid = set_tid [ ns -> level - i ] = arg_set_tid [ns -> level - i ];
196215
197216 retval = - EINVAL ;
198- if (tid < 1 || tid >= pid_max )
199- goto out_free ;
217+ if (tid < 1 || tid >= pid_max [ ns -> level - i ] )
218+ goto out_abort ;
200219 /*
201220 * Also fail if a PID != 1 is requested and
202221 * no PID 1 exists.
203222 */
204223 if (tid != 1 && !tmp -> child_reaper )
205- goto out_free ;
224+ goto out_abort ;
206225 retval = - EPERM ;
207226 if (!checkpoint_restore_ns_capable (tmp -> user_ns ))
208- goto out_free ;
209- set_tid_size -- ;
227+ goto out_abort ;
228+ arg_set_tid_size -- ;
210229 }
211230
212- idr_preload (GFP_KERNEL );
213- spin_lock (& pidmap_lock );
231+ tmp = tmp -> parent ;
232+ }
233+
234+ /*
235+ * Prep is done, id allocation goes here:
236+ */
237+ retried_preload = false;
238+ idr_preload (GFP_KERNEL );
239+ spin_lock (& pidmap_lock );
240+ for (tmp = ns , i = ns -> level ; i >= 0 ;) {
241+ int tid = set_tid [ns -> level - i ];
214242
215243 if (tid ) {
216244 nr = idr_alloc (& tmp -> idr , NULL , tid ,
@@ -220,6 +248,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
220248 * alreay in use. Return EEXIST in that case.
221249 */
222250 if (nr == - ENOSPC )
251+
223252 nr = - EEXIST ;
224253 } else {
225254 int pid_min = 1 ;
@@ -235,19 +264,42 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
235264 * a partially initialized PID (see below).
236265 */
237266 nr = idr_alloc_cyclic (& tmp -> idr , NULL , pid_min ,
238- pid_max , GFP_ATOMIC );
267+ pid_max [ns -> level - i ], GFP_ATOMIC );
268+ if (nr == - ENOSPC )
269+ nr = - EAGAIN ;
239270 }
240- spin_unlock (& pidmap_lock );
241- idr_preload_end ();
242271
243- if (nr < 0 ) {
244- retval = (nr == - ENOSPC ) ? - EAGAIN : nr ;
272+ if (unlikely (nr < 0 )) {
273+ /*
274+ * Preload more memory if idr_alloc{,cyclic} failed with -ENOMEM.
275+ *
276+ * The IDR API only allows us to preload memory for one call, while we may end
277+ * up doing several under pidmap_lock with GFP_ATOMIC. The situation may be
278+ * salvageable with GFP_KERNEL. But make sure to not loop indefinitely if preload
279+ * did not help (the routine unfortunately returns void, so we have no idea
280+ * if it got anywhere).
281+ *
282+ * The lock can be safely dropped and picked up as historically pid allocation
283+ * for different namespaces was *not* atomic -- we try to hold on to it the
284+ * entire time only for performance reasons.
285+ */
286+ if (nr == - ENOMEM && !retried_preload ) {
287+ spin_unlock (& pidmap_lock );
288+ idr_preload_end ();
289+ retried_preload = true;
290+ idr_preload (GFP_KERNEL );
291+ spin_lock (& pidmap_lock );
292+ continue ;
293+ }
294+ retval = nr ;
245295 goto out_free ;
246296 }
247297
248298 pid -> numbers [i ].nr = nr ;
249299 pid -> numbers [i ].ns = tmp ;
250300 tmp = tmp -> parent ;
301+ i -- ;
302+ retried_preload = false;
251303 }
252304
253305 /*
@@ -257,25 +309,15 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
257309 * is what we have exposed to userspace for a long time and it is
258310 * documented behavior for pid namespaces. So we can't easily
259311 * change it even if there were an error code better suited.
312+ *
313+ * This can't be done earlier because we need to preserve other
314+ * error conditions.
260315 */
261316 retval = - ENOMEM ;
262-
263- get_pid_ns (ns );
264- refcount_set (& pid -> count , 1 );
265- spin_lock_init (& pid -> lock );
266- for (type = 0 ; type < PIDTYPE_MAX ; ++ type )
267- INIT_HLIST_HEAD (& pid -> tasks [type ]);
268-
269- init_waitqueue_head (& pid -> wait_pidfd );
270- INIT_HLIST_HEAD (& pid -> inodes );
271-
272- upid = pid -> numbers + ns -> level ;
273- idr_preload (GFP_KERNEL );
274- spin_lock (& pidmap_lock );
275- if (!(ns -> pid_allocated & PIDNS_ADDING ))
276- goto out_unlock ;
317+ if (unlikely (!(ns -> pid_allocated & PIDNS_ADDING )))
318+ goto out_free ;
277319 pidfs_add_pid (pid );
278- for ( ; upid >= pid -> numbers ; -- upid ) {
320+ for (upid = pid -> numbers + ns -> level ; upid >= pid -> numbers ; -- upid ) {
279321 /* Make the PID visible to find_pid_ns. */
280322 idr_replace (& upid -> ns -> idr , pid , upid -> nr );
281323 upid -> ns -> pid_allocated ++ ;
@@ -286,13 +328,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
286328
287329 return pid ;
288330
289- out_unlock :
290- spin_unlock (& pidmap_lock );
291- idr_preload_end ();
292- put_pid_ns (ns );
293-
294331out_free :
295- spin_lock (& pidmap_lock );
296332 while (++ i <= ns -> level ) {
297333 upid = pid -> numbers + i ;
298334 idr_remove (& upid -> ns -> idr , upid -> nr );
@@ -303,7 +339,10 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
303339 idr_set_cursor (& ns -> idr , 0 );
304340
305341 spin_unlock (& pidmap_lock );
342+ idr_preload_end ();
306343
344+ out_abort :
345+ put_pid_ns (ns );
307346 kmem_cache_free (ns -> pid_cachep , pid );
308347 return ERR_PTR (retval );
309348}
0 commit comments