Skip to content

Commit 591beb0

Browse files
committed
Merge tag 'io_uring-bpf-restrictions.4-20260206' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux
Pull io_uring bpf filters from Jens Axboe: "This adds support for cBPF filters for io_uring, as well as task-inherited restrictions and filters. seccomp and io_uring don't play along nicely, as most of the interesting data to filter on resides somewhat out-of-band, in the submission queue ring. As a result, things like containers and systemd that apply seccomp filters can't filter io_uring operations. That leaves them with just one choice if filtering is critical - filter the actual io_uring_setup(2) system call to simply disallow io_uring. That's rather unfortunate, and has limited us because of it. io_uring already has some filtering support. It requires the ring to be set up in a disabled state, and then a filter set can be applied. This filter set is completely bi-modal - an opcode is either enabled or it's not. Once a filter set is registered, the ring can be enabled. This is very restrictive, and it's not useful at all to systemd or containers which really want both broader and more specific control. This first adds support for cBPF filters for opcodes, which enables tighter control over what exactly a specific opcode may do. As examples, specific support is added for IORING_OP_OPENAT/OPENAT2, allowing filtering on resolve flags. And another example is added for IORING_OP_SOCKET, allowing filtering on domain/type/protocol. These are both common use cases. cBPF was chosen rather than eBPF, because the latter is often restricted in containers as well. These filters are run after the init phase of the request, which allows filters to even dip into data that is being passed in a struct in user memory, as the init side of requests makes that data stable by bringing it into the kernel. This allows filtering without needing to copy this data twice, or having filters etc. know about the exact layout of the user data. The filters get the already copied and sanitized data passed. 
On top of that support is added for per-task filters, meaning that any ring created with a task that has a per-task filter will get those filters applied when it's created. These filters are inherited across fork as well. Once a filter has been registered, any further added filters may only further restrict what operations are permitted. Filters cannot change the return value of an operation, they can only permit or deny it based on the contents" * tag 'io_uring-bpf-restrictions.4-20260206' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: io_uring: allow registration of per-task restrictions io_uring: add task fork hook io_uring/bpf_filter: add ref counts to struct io_bpf_filter io_uring/bpf_filter: cache lookup table in ctx->bpf_filters io_uring/bpf_filter: allow filtering on contents of struct open_how io_uring/net: allow filtering on IORING_OP_SOCKET data io_uring: add support for BPF filtering for opcode restrictions
2 parents f5d4fee + ed82f35 commit 591beb0

File tree

18 files changed

+789
-10
lines changed

18 files changed

+789
-10
lines changed

include/linux/io_uring.h

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ void __io_uring_free(struct task_struct *tsk);
1212
void io_uring_unreg_ringfd(void);
1313
const char *io_uring_get_opcode(u8 opcode);
1414
bool io_is_uring_fops(struct file *file);
15+
int __io_uring_fork(struct task_struct *tsk);
1516

1617
static inline void io_uring_files_cancel(void)
1718
{
@@ -25,9 +26,16 @@ static inline void io_uring_task_cancel(void)
2526
}
2627
static inline void io_uring_free(struct task_struct *tsk)
2728
{
28-
if (tsk->io_uring)
29+
if (tsk->io_uring || tsk->io_uring_restrict)
2930
__io_uring_free(tsk);
3031
}
32+
static inline int io_uring_fork(struct task_struct *tsk)
33+
{
34+
if (tsk->io_uring_restrict)
35+
return __io_uring_fork(tsk);
36+
37+
return 0;
38+
}
3139
#else
3240
static inline void io_uring_task_cancel(void)
3341
{
@@ -46,6 +54,10 @@ static inline bool io_is_uring_fops(struct file *file)
4654
{
4755
return false;
4856
}
57+
static inline int io_uring_fork(struct task_struct *tsk)
58+
{
59+
return 0;
60+
}
4961
#endif
5062

5163
#endif

include/linux/io_uring_types.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,9 +219,20 @@ struct io_rings {
219219
struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp;
220220
};
221221

222+
struct io_bpf_filter;
223+
struct io_bpf_filters {
224+
refcount_t refs; /* ref for ->bpf_filters */
225+
spinlock_t lock; /* protects ->bpf_filters modifications */
226+
struct io_bpf_filter __rcu **filters;
227+
struct rcu_head rcu_head;
228+
};
229+
222230
struct io_restriction {
223231
DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
224232
DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
233+
struct io_bpf_filters *bpf_filters;
234+
/* ->bpf_filters needs COW on modification */
235+
bool bpf_filters_cow;
225236
u8 sqe_flags_allowed;
226237
u8 sqe_flags_required;
227238
/* IORING_OP_* restrictions exist */
@@ -278,6 +289,8 @@ struct io_ring_ctx {
278289

279290
struct task_struct *submitter_task;
280291
struct io_rings *rings;
292+
/* cache of ->restrictions.bpf_filters->filters */
293+
struct io_bpf_filter __rcu **bpf_filters;
281294
struct percpu_ref refs;
282295

283296
clockid_t clockid;

include/linux/sched.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1186,6 +1186,7 @@ struct task_struct {
11861186

11871187
#ifdef CONFIG_IO_URING
11881188
struct io_uring_task *io_uring;
1189+
struct io_restriction *io_uring_restrict;
11891190
#endif
11901191

11911192
/* Namespaces: */

include/uapi/linux/io_uring.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -712,6 +712,9 @@ enum io_uring_register_op {
712712
/* auxiliary zcrx configuration, see enum zcrx_ctrl_op */
713713
IORING_REGISTER_ZCRX_CTRL = 36,
714714

715+
/* register bpf filtering programs */
716+
IORING_REGISTER_BPF_FILTER = 37,
717+
715718
/* this goes last */
716719
IORING_REGISTER_LAST,
717720

@@ -817,6 +820,13 @@ struct io_uring_restriction {
817820
__u32 resv2[3];
818821
};
819822

823+
struct io_uring_task_restriction {
824+
__u16 flags;
825+
__u16 nr_res;
826+
__u32 resv[3];
827+
__DECLARE_FLEX_ARRAY(struct io_uring_restriction, restrictions);
828+
};
829+
820830
struct io_uring_clock_register {
821831
__u32 clockid;
822832
__u32 __resv[3];
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */
2+
/*
3+
* Header file for the io_uring BPF filters.
4+
*/
5+
#ifndef LINUX_IO_URING_BPF_FILTER_H
6+
#define LINUX_IO_URING_BPF_FILTER_H
7+
8+
#include <linux/types.h>
9+
10+
/*
11+
* Struct passed to filters.
12+
*/
13+
struct io_uring_bpf_ctx {
14+
__u64 user_data;
15+
__u8 opcode;
16+
__u8 sqe_flags;
17+
__u8 pdu_size; /* size of aux data for filter */
18+
__u8 pad[5];
19+
union {
20+
struct {
21+
__u32 family;
22+
__u32 type;
23+
__u32 protocol;
24+
} socket;
25+
struct {
26+
__u64 flags;
27+
__u64 mode;
28+
__u64 resolve;
29+
} open;
30+
};
31+
};
32+
33+
enum {
34+
/*
35+
* If set, any currently unset opcode will have a deny filter attached
36+
*/
37+
IO_URING_BPF_FILTER_DENY_REST = 1,
38+
};
39+
40+
struct io_uring_bpf_filter {
41+
__u32 opcode; /* io_uring opcode to filter */
42+
__u32 flags;
43+
__u32 filter_len; /* number of BPF instructions */
44+
__u32 resv;
45+
__u64 filter_ptr; /* pointer to BPF filter */
46+
__u64 resv2[5];
47+
};
48+
49+
enum {
50+
IO_URING_BPF_CMD_FILTER = 1,
51+
};
52+
53+
struct io_uring_bpf {
54+
__u16 cmd_type; /* IO_URING_BPF_* values */
55+
__u16 cmd_flags; /* none so far */
56+
__u32 resv;
57+
union {
58+
struct io_uring_bpf_filter filter;
59+
};
60+
};
61+
62+
#endif

io_uring/Kconfig

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,8 @@ config IO_URING_ZCRX
99
depends on PAGE_POOL
1010
depends on INET
1111
depends on NET_RX_BUSY_POLL
12+
13+
config IO_URING_BPF
14+
def_bool y
15+
depends on BPF
16+
depends on NET

io_uring/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,3 +24,4 @@ obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o
2424
obj-$(CONFIG_NET) += net.o cmd_net.o
2525
obj-$(CONFIG_PROC_FS) += fdinfo.o
2626
obj-$(CONFIG_IO_URING_MOCK_FILE) += mock_file.o
27+
obj-$(CONFIG_IO_URING_BPF) += bpf_filter.o

0 commit comments

Comments
 (0)