Skip to content

Commit ed82f35

Browse files
committed
io_uring: allow registration of per-task restrictions
Currently io_uring supports restricting operations on a per-ring basis. To use those, the ring must be set up in a disabled state by setting IORING_SETUP_R_DISABLED. Then restrictions can be set for the ring, and the ring can then be enabled. This commit adds support for IORING_REGISTER_RESTRICTIONS with ring_fd == -1, like the other "blind" register opcodes which work on the task rather than a specific ring. This allows registration of the same kind of restrictions as can be done on a specific ring, but with the task itself. Once done, any ring created will inherit these restrictions. If a restriction filter is registered with a task, then it's inherited on fork by its children. Children may only further restrict operations, not extend them. Inherited restrictions include both the classic IORING_REGISTER_RESTRICTIONS based restrictions, as well as the BPF filters that have been registered with the task via IORING_REGISTER_BPF_FILTER. Signed-off-by: Jens Axboe <axboe@kernel.dk>
1 parent 9fd9978 commit ed82f35

File tree

8 files changed

+231
-1
lines changed

8 files changed

+231
-1
lines changed

include/linux/io_uring_types.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,8 @@ struct io_restriction {
231231
DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
232232
DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
233233
struct io_bpf_filters *bpf_filters;
234+
/* ->bpf_filters needs COW on modification */
235+
bool bpf_filters_cow;
234236
u8 sqe_flags_allowed;
235237
u8 sqe_flags_required;
236238
/* IORING_OP_* restrictions exist */

include/uapi/linux/io_uring.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -808,6 +808,13 @@ struct io_uring_restriction {
808808
__u32 resv2[3];
809809
};
810810

811+
struct io_uring_task_restriction {
812+
__u16 flags;
813+
__u16 nr_res;
814+
__u32 resv[3];
815+
__DECLARE_FLEX_ARRAY(struct io_uring_restriction, restrictions);
816+
};
817+
811818
struct io_uring_clock_register {
812819
__u32 clockid;
813820
__u32 __resv[3];

io_uring/bpf_filter.c

Lines changed: 85 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -249,13 +249,77 @@ static int io_uring_check_cbpf_filter(struct sock_filter *filter,
249249
return 0;
250250
}
251251

252+
void io_bpf_filter_clone(struct io_restriction *dst, struct io_restriction *src)
253+
{
254+
if (!src->bpf_filters)
255+
return;
256+
257+
rcu_read_lock();
258+
/*
259+
* If the src filter is going away, just ignore it.
260+
*/
261+
if (refcount_inc_not_zero(&src->bpf_filters->refs)) {
262+
dst->bpf_filters = src->bpf_filters;
263+
dst->bpf_filters_cow = true;
264+
}
265+
rcu_read_unlock();
266+
}
267+
268+
/*
269+
* Allocate a new struct io_bpf_filters. Used when a filter is cloned and
270+
* modifications need to be made.
271+
*/
272+
static struct io_bpf_filters *io_bpf_filter_cow(struct io_restriction *src)
273+
{
274+
struct io_bpf_filters *filters;
275+
struct io_bpf_filter *srcf;
276+
int i;
277+
278+
filters = io_new_bpf_filters();
279+
if (IS_ERR(filters))
280+
return filters;
281+
282+
/*
283+
* Iterate filters from src and assign in destination. Grabbing
284+
* a reference is enough, we don't need to duplicate the memory.
285+
* This is safe because filters are only ever appended to the
286+
* front of the list, hence the only memory ever touched inside
287+
* a filter is the refcount.
288+
*/
289+
rcu_read_lock();
290+
for (i = 0; i < IORING_OP_LAST; i++) {
291+
srcf = rcu_dereference(src->bpf_filters->filters[i]);
292+
if (!srcf) {
293+
continue;
294+
} else if (srcf == &dummy_filter) {
295+
rcu_assign_pointer(filters->filters[i], &dummy_filter);
296+
continue;
297+
}
298+
299+
/*
300+
* Getting a ref on the first node is enough, putting the
301+
* filter and iterating nodes to free will stop on the first
302+
* one that doesn't hit zero when dropping.
303+
*/
304+
if (!refcount_inc_not_zero(&srcf->refs))
305+
goto err;
306+
rcu_assign_pointer(filters->filters[i], srcf);
307+
}
308+
rcu_read_unlock();
309+
return filters;
310+
err:
311+
rcu_read_unlock();
312+
__io_put_bpf_filters(filters);
313+
return ERR_PTR(-EBUSY);
314+
}
315+
252316
#define IO_URING_BPF_FILTER_FLAGS IO_URING_BPF_FILTER_DENY_REST
253317

254318
int io_register_bpf_filter(struct io_restriction *res,
255319
struct io_uring_bpf __user *arg)
256320
{
321+
struct io_bpf_filters *filters, *old_filters = NULL;
257322
struct io_bpf_filter *filter, *old_filter;
258-
struct io_bpf_filters *filters;
259323
struct io_uring_bpf reg;
260324
struct bpf_prog *prog;
261325
struct sock_fprog fprog;
@@ -297,6 +361,17 @@ int io_register_bpf_filter(struct io_restriction *res,
297361
ret = PTR_ERR(filters);
298362
goto err_prog;
299363
}
364+
} else if (res->bpf_filters_cow) {
365+
filters = io_bpf_filter_cow(res);
366+
if (IS_ERR(filters)) {
367+
ret = PTR_ERR(filters);
368+
goto err_prog;
369+
}
370+
/*
371+
* Stash old filters, we'll put them once we know we'll
372+
* succeed. Until then, res->bpf_filters is left untouched.
373+
*/
374+
old_filters = res->bpf_filters;
300375
}
301376

302377
filter = kzalloc(sizeof(*filter), GFP_KERNEL_ACCOUNT);
@@ -306,6 +381,15 @@ int io_register_bpf_filter(struct io_restriction *res,
306381
}
307382
refcount_set(&filter->refs, 1);
308383
filter->prog = prog;
384+
385+
/*
386+
* Success - install the new filter set now. If we did COW, put
387+
* the old filters as we're replacing them.
388+
*/
389+
if (old_filters) {
390+
__io_put_bpf_filters(old_filters);
391+
res->bpf_filters_cow = false;
392+
}
309393
res->bpf_filters = filters;
310394

311395
/*

io_uring/bpf_filter.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ int io_register_bpf_filter(struct io_restriction *res,
1313

1414
void io_put_bpf_filters(struct io_restriction *res);
1515

16+
void io_bpf_filter_clone(struct io_restriction *dst, struct io_restriction *src);
17+
1618
static inline int io_uring_run_bpf_filters(struct io_bpf_filter __rcu **filters,
1719
struct io_kiocb *req)
1820
{
@@ -37,6 +39,10 @@ static inline int io_uring_run_bpf_filters(struct io_bpf_filter __rcu **filters,
3739
static inline void io_put_bpf_filters(struct io_restriction *res)
3840
{
3941
}
42+
/*
 * No-op variant of io_bpf_filter_clone(): nothing is copied from @src and
 * @dst is left untouched. Presumably selected when CONFIG_IO_URING_BPF is
 * disabled - the matching preprocessor branch is not visible here.
 */
static inline void io_bpf_filter_clone(struct io_restriction *dst,
				       struct io_restriction *src)
{
}
4046
#endif /* CONFIG_IO_URING_BPF */
4147

4248
#endif

io_uring/io_uring.c

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2880,6 +2880,32 @@ int io_prepare_config(struct io_ctx_config *config)
28802880
return 0;
28812881
}
28822882

2883+
void io_restriction_clone(struct io_restriction *dst, struct io_restriction *src)
2884+
{
2885+
memcpy(&dst->register_op, &src->register_op, sizeof(dst->register_op));
2886+
memcpy(&dst->sqe_op, &src->sqe_op, sizeof(dst->sqe_op));
2887+
dst->sqe_flags_allowed = src->sqe_flags_allowed;
2888+
dst->sqe_flags_required = src->sqe_flags_required;
2889+
dst->op_registered = src->op_registered;
2890+
dst->reg_registered = src->reg_registered;
2891+
2892+
io_bpf_filter_clone(dst, src);
2893+
}
2894+
2895+
static void io_ctx_restriction_clone(struct io_ring_ctx *ctx,
2896+
struct io_restriction *src)
2897+
{
2898+
struct io_restriction *dst = &ctx->restrictions;
2899+
2900+
io_restriction_clone(dst, src);
2901+
if (dst->bpf_filters)
2902+
WRITE_ONCE(ctx->bpf_filters, dst->bpf_filters->filters);
2903+
if (dst->op_registered)
2904+
ctx->op_restricted = 1;
2905+
if (dst->reg_registered)
2906+
ctx->reg_restricted = 1;
2907+
}
2908+
28832909
static __cold int io_uring_create(struct io_ctx_config *config)
28842910
{
28852911
struct io_uring_params *p = &config->p;
@@ -2940,6 +2966,13 @@ static __cold int io_uring_create(struct io_ctx_config *config)
29402966
else
29412967
ctx->notify_method = TWA_SIGNAL;
29422968

2969+
/*
2970+
* If the current task has restrictions enabled, then copy them to
2971+
* our newly created ring and mark it as registered.
2972+
*/
2973+
if (current->io_uring_restrict)
2974+
io_ctx_restriction_clone(ctx, current->io_uring_restrict);
2975+
29432976
/*
29442977
* This is just grabbed for accounting purposes. When a process exits,
29452978
* the mm is exited and dropped before the files, hence we need to hang

io_uring/io_uring.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,7 @@ void io_task_refs_refill(struct io_uring_task *tctx);
197197
bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
198198

199199
void io_activate_pollwq(struct io_ring_ctx *ctx);
200+
void io_restriction_clone(struct io_restriction *dst, struct io_restriction *src);
200201

201202
static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
202203
{

io_uring/register.c

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,82 @@ static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
190190
return 0;
191191
}
192192

193+
/*
 * Register a set of restrictions with the current task instead of with a
 * specific ring.
 *
 * @arg:     user pointer to a struct io_uring_task_restriction
 * @nr_args: must be 1
 *
 * Returns 0 on success. Fails with -EPERM if the task already has
 * restrictions attached, -EACCES if the privilege check below fails,
 * -EINVAL for a bad @nr_args or non-zero flags/reserved fields, -EFAULT if
 * the header cannot be copied in, -ENOMEM on allocation failure, or the
 * error returned by io_parse_restrictions().
 */
static int io_register_restrictions_task(void __user *arg, unsigned int nr_args)
{
	struct io_uring_task_restriction __user *ures = arg;
	struct io_uring_task_restriction hdr;
	struct io_restriction *new_res;
	int ret;

	/* An already attached restriction set cannot be replaced */
	if (current->io_uring_restrict)
		return -EPERM;

	/*
	 * Similar to seccomp: only allowed if the task has no_new_privs set
	 * or holds CAP_SYS_ADMIN in its user namespace.
	 */
	if (!task_no_new_privs(current) &&
	    !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
		return -EACCES;

	if (nr_args != 1)
		return -EINVAL;

	if (copy_from_user(&hdr, ures, sizeof(hdr)))
		return -EFAULT;

	/* No flags are defined, and the reserved space must be cleared */
	if (hdr.flags || !mem_is_zero(hdr.resv, sizeof(hdr.resv)))
		return -EINVAL;

	new_res = kzalloc(sizeof(*new_res), GFP_KERNEL_ACCOUNT);
	if (!new_res)
		return -ENOMEM;

	ret = io_parse_restrictions(ures->restrictions, hdr.nr_res, new_res);
	if (ret >= 0) {
		current->io_uring_restrict = new_res;
		return 0;
	}

	kfree(new_res);
	return ret;
}
233+
234+
/*
 * Register a BPF filter with the current task instead of with a specific
 * ring.
 *
 * @arg:     user pointer handed through to io_register_bpf_filter()
 * @nr_args: must be 1
 *
 * If the task has no restriction state yet, one is allocated here and only
 * attached to the task once the filter registration has succeeded.
 *
 * Returns 0 on success, -EACCES if the privilege check below fails,
 * -EINVAL for a bad @nr_args, -ENOMEM on allocation failure, or the error
 * returned by io_register_bpf_filter().
 */
static int io_register_bpf_filter_task(void __user *arg, unsigned int nr_args)
{
	struct io_restriction *res;
	bool allocated = false;
	int ret;

	/*
	 * Similar to seccomp: only allowed if the task has no_new_privs set
	 * or holds CAP_SYS_ADMIN in its user namespace.
	 */
	if (!task_no_new_privs(current) &&
	    !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
		return -EACCES;

	if (nr_args != 1)
		return -EINVAL;

	/* Reuse the task's restriction state if it has one, else create it */
	res = current->io_uring_restrict;
	if (!res) {
		res = kzalloc(sizeof(*res), GFP_KERNEL_ACCOUNT);
		if (!res)
			return -ENOMEM;
		allocated = true;
	}

	ret = io_register_bpf_filter(res, arg);
	if (!allocated)
		return ret;

	/* Freshly allocated state is either attached to the task or freed */
	if (ret) {
		kfree(res);
		return ret;
	}

	current->io_uring_restrict = res;
	return 0;
}
268+
193269
static int io_register_enable_rings(struct io_ring_ctx *ctx)
194270
{
195271
if (!(ctx->flags & IORING_SETUP_R_DISABLED))
@@ -912,6 +988,10 @@ static int io_uring_register_blind(unsigned int opcode, void __user *arg,
912988
return io_uring_register_send_msg_ring(arg, nr_args);
913989
case IORING_REGISTER_QUERY:
914990
return io_query(arg, nr_args);
991+
case IORING_REGISTER_RESTRICTIONS:
992+
return io_register_restrictions_task(arg, nr_args);
993+
case IORING_REGISTER_BPF_FILTER:
994+
return io_register_bpf_filter_task(arg, nr_args);
915995
}
916996
return -EINVAL;
917997
}

io_uring/tctx.c

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
#include "io_uring.h"
1313
#include "tctx.h"
14+
#include "bpf_filter.h"
1415

1516
static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
1617
struct task_struct *task)
@@ -66,6 +67,11 @@ void __io_uring_free(struct task_struct *tsk)
6667
kfree(tctx);
6768
tsk->io_uring = NULL;
6869
}
70+
if (tsk->io_uring_restrict) {
71+
io_put_bpf_filters(tsk->io_uring_restrict);
72+
kfree(tsk->io_uring_restrict);
73+
tsk->io_uring_restrict = NULL;
74+
}
6975
}
7076

7177
__cold int io_uring_alloc_task_context(struct task_struct *task,
@@ -356,5 +362,16 @@ int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,
356362

357363
int __io_uring_fork(struct task_struct *tsk)
358364
{
365+
struct io_restriction *res, *src = tsk->io_uring_restrict;
366+
367+
/* Don't leave it dangling on error */
368+
tsk->io_uring_restrict = NULL;
369+
370+
res = kzalloc(sizeof(*res), GFP_KERNEL_ACCOUNT);
371+
if (!res)
372+
return -ENOMEM;
373+
374+
tsk->io_uring_restrict = res;
375+
io_restriction_clone(res, src);
359376
return 0;
360377
}

0 commit comments

Comments
 (0)