Skip to content

Commit 8021824

Browse files
committed
pidfs: convert rb-tree to rhashtable
Mateusz reported performance penalties [1] during task creation because pidfs uses pidmap_lock to add elements into the rbtree. Switch to an rhashtable to have separate fine-grained locking and to decouple from pidmap_lock moving all heavy manipulations outside of it. Convert the pidfs inode-to-pid mapping from an rb-tree with seqcount protection to an rhashtable. This removes the global pidmap_lock contention from pidfs_ino_get_pid() lookups and allows the hashtable insert to happen outside the pidmap_lock. pidfs_add_pid() is split. pidfs_prepare_pid() allocates inode number and initializes pid fields and is called inside pidmap_lock. pidfs_add_pid() inserts pid into rhashtable and is called outside pidmap_lock. Insertion into the rhashtable can fail and memory allocation may happen so we need to drop the spinlock. To guard against accidently opening an already reaped task pidfs_ino_get_pid() uses additional checks beyond pid_vnr(). If pid->attr is PIDFS_PID_DEAD or NULL the pid either never had a pidfd or it already went through pidfs_exit() aka the process as already reaped. If pid->attr is valid check PIDFS_ATTR_BIT_EXIT to figure out whether the task has exited. This slightly changes visibility semantics: pidfd creation is denied after pidfs_exit() runs, which is just before the pid number is removed from the via free_pid(). That should not be an issue though. Link: https://lore.kernel.org/20251206131955.780557-1-mjguzik@gmail.com [1] Link: https://patch.msgid.link/20260120-work-pidfs-rhashtable-v2-1-d593c4d0f576@kernel.org Reviewed-by: Jan Kara <jack@suse.cz> Signed-off-by: Christian Brauner <brauner@kernel.org>
1 parent a344860 commit 8021824

File tree

4 files changed

+46
-55
lines changed

4 files changed

+46
-55
lines changed

fs/pidfs.c

Lines changed: 33 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include <linux/utsname.h>
2222
#include <net/net_namespace.h>
2323
#include <linux/coredump.h>
24+
#include <linux/rhashtable.h>
2425
#include <linux/xattr.h>
2526

2627
#include "internal.h"
@@ -55,7 +56,14 @@ struct pidfs_attr {
5556
__u32 coredump_signal;
5657
};
5758

58-
static struct rb_root pidfs_ino_tree = RB_ROOT;
59+
static struct rhashtable pidfs_ino_ht;
60+
61+
static const struct rhashtable_params pidfs_ino_ht_params = {
62+
.key_offset = offsetof(struct pid, ino),
63+
.key_len = sizeof(u64),
64+
.head_offset = offsetof(struct pid, pidfs_hash),
65+
.automatic_shrinking = true,
66+
};
5967

6068
#if BITS_PER_LONG == 32
6169
static inline unsigned long pidfs_ino(u64 ino)
@@ -84,21 +92,11 @@ static inline u32 pidfs_gen(u64 ino)
8492
}
8593
#endif
8694

87-
static int pidfs_ino_cmp(struct rb_node *a, const struct rb_node *b)
88-
{
89-
struct pid *pid_a = rb_entry(a, struct pid, pidfs_node);
90-
struct pid *pid_b = rb_entry(b, struct pid, pidfs_node);
91-
u64 pid_ino_a = pid_a->ino;
92-
u64 pid_ino_b = pid_b->ino;
93-
94-
if (pid_ino_a < pid_ino_b)
95-
return -1;
96-
if (pid_ino_a > pid_ino_b)
97-
return 1;
98-
return 0;
99-
}
100-
101-
void pidfs_add_pid(struct pid *pid)
95+
/*
96+
* Allocate inode number and initialize pidfs fields.
97+
* Called with pidmap_lock held.
98+
*/
99+
void pidfs_prepare_pid(struct pid *pid)
102100
{
103101
static u64 pidfs_ino_nr = 2;
104102

@@ -131,20 +129,22 @@ void pidfs_add_pid(struct pid *pid)
131129
pidfs_ino_nr += 2;
132130

133131
pid->ino = pidfs_ino_nr;
132+
pid->pidfs_hash.next = NULL;
134133
pid->stashed = NULL;
135134
pid->attr = NULL;
136135
pidfs_ino_nr++;
136+
}
137137

138-
write_seqcount_begin(&pidmap_lock_seq);
139-
rb_find_add_rcu(&pid->pidfs_node, &pidfs_ino_tree, pidfs_ino_cmp);
140-
write_seqcount_end(&pidmap_lock_seq);
138+
int pidfs_add_pid(struct pid *pid)
139+
{
140+
return rhashtable_insert_fast(&pidfs_ino_ht, &pid->pidfs_hash,
141+
pidfs_ino_ht_params);
141142
}
142143

143144
void pidfs_remove_pid(struct pid *pid)
144145
{
145-
write_seqcount_begin(&pidmap_lock_seq);
146-
rb_erase(&pid->pidfs_node, &pidfs_ino_tree);
147-
write_seqcount_end(&pidmap_lock_seq);
146+
rhashtable_remove_fast(&pidfs_ino_ht, &pid->pidfs_hash,
147+
pidfs_ino_ht_params);
148148
}
149149

150150
void pidfs_free_pid(struct pid *pid)
@@ -773,42 +773,24 @@ static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
773773
return FILEID_KERNFS;
774774
}
775775

776-
static int pidfs_ino_find(const void *key, const struct rb_node *node)
777-
{
778-
const u64 pid_ino = *(u64 *)key;
779-
const struct pid *pid = rb_entry(node, struct pid, pidfs_node);
780-
781-
if (pid_ino < pid->ino)
782-
return -1;
783-
if (pid_ino > pid->ino)
784-
return 1;
785-
return 0;
786-
}
787-
788776
/* Find a struct pid based on the inode number. */
789777
static struct pid *pidfs_ino_get_pid(u64 ino)
790778
{
791779
struct pid *pid;
792-
struct rb_node *node;
793-
unsigned int seq;
780+
struct pidfs_attr *attr;
794781

795782
guard(rcu)();
796-
do {
797-
seq = read_seqcount_begin(&pidmap_lock_seq);
798-
node = rb_find_rcu(&ino, &pidfs_ino_tree, pidfs_ino_find);
799-
if (node)
800-
break;
801-
} while (read_seqcount_retry(&pidmap_lock_seq, seq));
802-
803-
if (!node)
783+
pid = rhashtable_lookup(&pidfs_ino_ht, &ino, pidfs_ino_ht_params);
784+
if (!pid)
785+
return NULL;
786+
attr = READ_ONCE(pid->attr);
787+
if (IS_ERR_OR_NULL(attr))
788+
return NULL;
789+
if (test_bit(PIDFS_ATTR_BIT_EXIT, &attr->attr_mask))
804790
return NULL;
805-
806-
pid = rb_entry(node, struct pid, pidfs_node);
807-
808791
/* Within our pid namespace hierarchy? */
809792
if (pid_vnr(pid) == 0)
810793
return NULL;
811-
812794
return get_pid(pid);
813795
}
814796

@@ -1086,6 +1068,9 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
10861068

10871069
void __init pidfs_init(void)
10881070
{
1071+
if (rhashtable_init(&pidfs_ino_ht, &pidfs_ino_ht_params))
1072+
panic("Failed to initialize pidfs hashtable");
1073+
10891074
pidfs_attr_cachep = kmem_cache_create("pidfs_attr_cache", sizeof(struct pidfs_attr), 0,
10901075
(SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
10911076
SLAB_ACCOUNT | SLAB_PANIC), NULL);

include/linux/pid.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#include <linux/rculist.h>
77
#include <linux/rcupdate.h>
88
#include <linux/refcount.h>
9+
#include <linux/rhashtable-types.h>
910
#include <linux/sched.h>
1011
#include <linux/wait.h>
1112

@@ -60,7 +61,7 @@ struct pid {
6061
spinlock_t lock;
6162
struct {
6263
u64 ino;
63-
struct rb_node pidfs_node;
64+
struct rhash_head pidfs_hash;
6465
struct dentry *stashed;
6566
struct pidfs_attr *attr;
6667
};
@@ -73,7 +74,6 @@ struct pid {
7374
struct upid numbers[];
7475
};
7576

76-
extern seqcount_spinlock_t pidmap_lock_seq;
7777
extern struct pid init_struct_pid;
7878

7979
struct file;

include/linux/pidfs.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@ struct coredump_params;
66

77
struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags);
88
void __init pidfs_init(void);
9-
void pidfs_add_pid(struct pid *pid);
9+
void pidfs_prepare_pid(struct pid *pid);
10+
int pidfs_add_pid(struct pid *pid);
1011
void pidfs_remove_pid(struct pid *pid);
1112
void pidfs_exit(struct task_struct *tsk);
1213
#ifdef CONFIG_COREDUMP

kernel/pid.c

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@
4343
#include <linux/sched/task.h>
4444
#include <linux/idr.h>
4545
#include <linux/pidfs.h>
46-
#include <linux/seqlock.h>
4746
#include <net/sock.h>
4847
#include <uapi/linux/pidfd.h>
4948

@@ -85,7 +84,6 @@ struct pid_namespace init_pid_ns = {
8584
EXPORT_SYMBOL_GPL(init_pid_ns);
8685

8786
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
88-
seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock);
8987

9088
void put_pid(struct pid *pid)
9189
{
@@ -141,9 +139,9 @@ void free_pid(struct pid *pid)
141139

142140
idr_remove(&ns->idr, upid->nr);
143141
}
144-
pidfs_remove_pid(pid);
145142
spin_unlock(&pidmap_lock);
146143

144+
pidfs_remove_pid(pid);
147145
call_rcu(&pid->rcu, delayed_put_pid);
148146
}
149147

@@ -316,7 +314,8 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
316314
retval = -ENOMEM;
317315
if (unlikely(!(ns->pid_allocated & PIDNS_ADDING)))
318316
goto out_free;
319-
pidfs_add_pid(pid);
317+
pidfs_prepare_pid(pid);
318+
320319
for (upid = pid->numbers + ns->level; upid >= pid->numbers; --upid) {
321320
/* Make the PID visible to find_pid_ns. */
322321
idr_replace(&upid->ns->idr, pid, upid->nr);
@@ -326,6 +325,12 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
326325
idr_preload_end();
327326
ns_ref_active_get(ns);
328327

328+
retval = pidfs_add_pid(pid);
329+
if (unlikely(retval)) {
330+
free_pid(pid);
331+
pid = ERR_PTR(-ENOMEM);
332+
}
333+
329334
return pid;
330335

331336
out_free:

0 commit comments

Comments
 (0)