Skip to content

Commit f5d4fee

Browse files
committed
Merge tag 'for-7.0/io_uring-20260206' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux
Pull io_uring updates from Jens Axboe: - Clean up the IORING_SETUP_R_DISABLED and submitter task checking, mostly just in preparation for relaxing the locking for SINGLE_ISSUER in the future. - Improve IOPOLL by using a doubly linked list to manage completions. Previously it was singly linked, which meant that to complete request N, requests 0..N-1 in the chain had to have completed first. With a doubly linked list we can complete whatever request completes in that order, rather than needing to wait for a consecutive range to be available. This reduces latencies. - Improve the restriction setup and checking. Mostly in preparation for adding further features on top of that. Coming in a separate pull request. - Split out task_work and wait handling into separate files. These are mostly nicely abstracted already, but still remained in the io_uring.c file which is on the larger side. - Use GFP_KERNEL_ACCOUNT in a few more spots, where appropriate. - Ensure even the idle io-wq worker exits if a task no longer has any rings open. - Add support for a non-circular submission queue. By default, the SQ ring keeps moving around, even if only a few entries are used for each submission. This can be wasteful in terms of cachelines. If IORING_SETUP_SQ_REWIND is set for the ring when created, each submission will start at offset 0 instead of where we last left off doing submissions. 
- Various little cleanups * tag 'for-7.0/io_uring-20260206' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (30 commits) io_uring/kbuf: fix memory leak if io_buffer_add_list fails io_uring: Add SPDX id lines to remaining source files io_uring: allow io-wq workers to exit when unused io_uring/io-wq: add exit-on-idle state io_uring/net: don't continue send bundle if poll was required for retry io_uring/rsrc: use GFP_KERNEL_ACCOUNT consistently io_uring/futex: use GFP_KERNEL_ACCOUNT for futex data allocation io_uring/io-wq: handle !sysctl_hung_task_timeout_secs io_uring: fix bad indentation for setup flags if statement io_uring/rsrc: take unsigned index in io_rsrc_node_lookup() io_uring: introduce non-circular SQ io_uring: split out CQ waiting code into wait.c io_uring: split out task work code into tw.c io_uring/io-wq: don't trigger hung task for syzbot craziness io_uring: add IO_URING_EXIT_WAIT_MAX definition io_uring/sync: validate passed in offset io_uring/eventfd: remove unused ctx->evfd_last_cq_tail member io_uring/timeout: annotate data race in io_flush_timeouts() io_uring/uring_cmd: explicitly disallow cancelations for IOPOLL io_uring: fix IOPOLL with passthrough I/O ...
2 parents 26c9342 + 442ae40 commit f5d4fee

35 files changed

+1074
-915
lines changed

include/linux/io_uring_types.h

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,10 @@ struct io_restriction {
224224
DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
225225
u8 sqe_flags_allowed;
226226
u8 sqe_flags_required;
227-
bool registered;
227+
/* IORING_OP_* restrictions exist */
228+
bool op_registered;
229+
/* IORING_REGISTER_* restrictions exist */
230+
bool reg_registered;
228231
};
229232

230233
struct io_submit_link {
@@ -259,7 +262,8 @@ struct io_ring_ctx {
259262
struct {
260263
unsigned int flags;
261264
unsigned int drain_next: 1;
262-
unsigned int restricted: 1;
265+
unsigned int op_restricted: 1;
266+
unsigned int reg_restricted: 1;
263267
unsigned int off_timeout_used: 1;
264268
unsigned int drain_active: 1;
265269
unsigned int has_evfd: 1;
@@ -316,7 +320,7 @@ struct io_ring_ctx {
316320
* manipulate the list, hence no extra locking is needed there.
317321
*/
318322
bool poll_multi_queue;
319-
struct io_wq_work_list iopoll_list;
323+
struct list_head iopoll_list;
320324

321325
struct io_file_table file_table;
322326
struct io_rsrc_data buf_table;
@@ -444,6 +448,9 @@ struct io_ring_ctx {
444448
struct list_head defer_list;
445449
unsigned nr_drained;
446450

451+
/* protected by ->completion_lock */
452+
unsigned nr_req_allocated;
453+
447454
#ifdef CONFIG_NET_RX_BUSY_POLL
448455
struct list_head napi_list; /* track busy poll napi_id */
449456
spinlock_t napi_lock; /* napi_list lock */
@@ -456,10 +463,6 @@ struct io_ring_ctx {
456463
DECLARE_HASHTABLE(napi_ht, 4);
457464
#endif
458465

459-
/* protected by ->completion_lock */
460-
unsigned evfd_last_cq_tail;
461-
unsigned nr_req_allocated;
462-
463466
/*
464467
* Protection for resize vs mmap races - both the mmap and resize
465468
* side will need to grab this lock, to prevent either side from
@@ -714,15 +717,21 @@ struct io_kiocb {
714717

715718
atomic_t refs;
716719
bool cancel_seq_set;
717-
struct io_task_work io_task_work;
720+
721+
union {
722+
struct io_task_work io_task_work;
723+
/* For IOPOLL setup queues, with hybrid polling */
724+
u64 iopoll_start;
725+
};
726+
718727
union {
719728
/*
720729
* for polled requests, i.e. IORING_OP_POLL_ADD and async armed
721730
* poll
722731
*/
723732
struct hlist_node hash_node;
724-
/* For IOPOLL setup queues, with hybrid polling */
725-
u64 iopoll_start;
733+
/* IOPOLL completion handling */
734+
struct list_head iopoll_node;
726735
/* for private io_kiocb freeing */
727736
struct rcu_head rcu_head;
728737
};

include/uapi/linux/io_uring.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,18 @@ enum io_uring_sqe_flags_bit {
237237
*/
238238
#define IORING_SETUP_SQE_MIXED (1U << 19)
239239

240+
/*
241+
* When set, io_uring ignores SQ head and tail and fetches SQEs to submit
242+
* starting from index 0 instead from the index stored in the head pointer.
243+
* IOW, the user should place all SQE at the beginning of the SQ memory
244+
* before issuing a submission syscall.
245+
*
246+
* It requires IORING_SETUP_NO_SQARRAY and is incompatible with
247+
* IORING_SETUP_SQPOLL. The user must also never change the SQ head and tail
248+
* values and keep it set to 0. Any other value is undefined behaviour.
249+
*/
250+
#define IORING_SETUP_SQ_REWIND (1U << 20)
251+
240252
enum io_uring_op {
241253
IORING_OP_NOP,
242254
IORING_OP_READV,

io_uring/Makefile

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,14 @@ endif
88

99
obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \
1010
tctx.o filetable.o rw.o poll.o \
11-
eventfd.o uring_cmd.o openclose.o \
12-
sqpoll.o xattr.o nop.o fs.o splice.o \
13-
sync.o msg_ring.o advise.o openclose.o \
14-
statx.o timeout.o cancel.o \
15-
waitid.o register.o truncate.o \
16-
memmap.o alloc_cache.o query.o
11+
tw.o wait.o eventfd.o uring_cmd.o \
12+
openclose.o sqpoll.o xattr.o nop.o \
13+
fs.o splice.o sync.o msg_ring.o \
14+
advise.o openclose.o statx.o timeout.o \
15+
cancel.o waitid.o register.o \
16+
truncate.o memmap.o alloc_cache.o \
17+
query.o
18+
1719
obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o
1820
obj-$(CONFIG_IO_WQ) += io-wq.o
1921
obj-$(CONFIG_FUTEX) += futex.o

io_uring/alloc_cache.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
1+
/* SPDX-License-Identifier: GPL-2.0 */
12
#ifndef IOU_ALLOC_CACHE_H
23
#define IOU_ALLOC_CACHE_H
34

45
#include <linux/io_uring_types.h>
6+
#include <linux/kasan.h>
57

68
/*
79
* Don't allow the cache to grow beyond this size.

io_uring/cancel.c

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,8 @@
22
#include <linux/kernel.h>
33
#include <linux/errno.h>
44
#include <linux/fs.h>
5-
#include <linux/file.h>
65
#include <linux/mm.h>
76
#include <linux/slab.h>
8-
#include <linux/namei.h>
97
#include <linux/nospec.h>
108
#include <linux/io_uring.h>
119

@@ -21,6 +19,7 @@
2119
#include "waitid.h"
2220
#include "futex.h"
2321
#include "cancel.h"
22+
#include "wait.h"
2423

2524
struct io_cancel {
2625
struct file *file;
@@ -539,7 +538,7 @@ __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
539538
/* SQPOLL thread does its own polling */
540539
if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
541540
is_sqpoll_thread) {
542-
while (!wq_list_empty(&ctx->iopoll_list)) {
541+
while (!list_empty(&ctx->iopoll_list)) {
543542
io_iopoll_try_reap_events(ctx);
544543
ret = true;
545544
cond_resched();

io_uring/cmd_net.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
// SPDX-License-Identifier: GPL-2.0
12
#include <asm/ioctls.h>
23
#include <linux/io_uring/net.h>
34
#include <linux/errqueue.h>

io_uring/eventfd.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
/* SPDX-License-Identifier: GPL-2.0 */
12

23
struct io_ring_ctx;
34
int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,

io_uring/filetable.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
#ifndef IOU_FILE_TABLE_H
33
#define IOU_FILE_TABLE_H
44

5-
#include <linux/file.h>
65
#include <linux/io_uring_types.h>
76
#include "rsrc.h"
87

io_uring/futex.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ int io_futexv_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
186186
return -EINVAL;
187187

188188
ifd = kzalloc(struct_size_t(struct io_futexv_data, futexv, iof->futex_nr),
189-
GFP_KERNEL);
189+
GFP_KERNEL_ACCOUNT);
190190
if (!ifd)
191191
return -ENOMEM;
192192

io_uring/io-wq.c

Lines changed: 48 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include <linux/task_work.h>
1818
#include <linux/audit.h>
1919
#include <linux/mmu_context.h>
20+
#include <linux/sched/sysctl.h>
2021
#include <uapi/linux/io_uring.h>
2122

2223
#include "io-wq.h"
@@ -34,6 +35,7 @@ enum {
3435

3536
enum {
3637
IO_WQ_BIT_EXIT = 0, /* wq exiting */
38+
IO_WQ_BIT_EXIT_ON_IDLE = 1, /* allow all workers to exit on idle */
3739
};
3840

3941
enum {
@@ -706,9 +708,13 @@ static int io_wq_worker(void *data)
706708
raw_spin_lock(&acct->workers_lock);
707709
/*
708710
* Last sleep timed out. Exit if we're not the last worker,
709-
* or if someone modified our affinity.
711+
* or if someone modified our affinity. If wq is marked
712+
* idle-exit, drop the worker as well. This is used to avoid
713+
* keeping io-wq workers around for tasks that no longer have
714+
* any active io_uring instances.
710715
*/
711-
if (last_timeout && (exit_mask || acct->nr_workers > 1)) {
716+
if ((last_timeout && (exit_mask || acct->nr_workers > 1)) ||
717+
test_bit(IO_WQ_BIT_EXIT_ON_IDLE, &wq->state)) {
712718
acct->nr_workers--;
713719
raw_spin_unlock(&acct->workers_lock);
714720
__set_current_state(TASK_RUNNING);
@@ -963,6 +969,24 @@ static bool io_wq_worker_wake(struct io_worker *worker, void *data)
963969
return false;
964970
}
965971

972+
void io_wq_set_exit_on_idle(struct io_wq *wq, bool enable)
973+
{
974+
if (!wq->task)
975+
return;
976+
977+
if (!enable) {
978+
clear_bit(IO_WQ_BIT_EXIT_ON_IDLE, &wq->state);
979+
return;
980+
}
981+
982+
if (test_and_set_bit(IO_WQ_BIT_EXIT_ON_IDLE, &wq->state))
983+
return;
984+
985+
rcu_read_lock();
986+
io_wq_for_each_worker(wq, io_wq_worker_wake, NULL);
987+
rcu_read_unlock();
988+
}
989+
966990
static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq)
967991
{
968992
do {
@@ -1313,6 +1337,8 @@ static void io_wq_cancel_tw_create(struct io_wq *wq)
13131337

13141338
static void io_wq_exit_workers(struct io_wq *wq)
13151339
{
1340+
unsigned long timeout, warn_timeout;
1341+
13161342
if (!wq->task)
13171343
return;
13181344

@@ -1322,7 +1348,26 @@ static void io_wq_exit_workers(struct io_wq *wq)
13221348
io_wq_for_each_worker(wq, io_wq_worker_wake, NULL);
13231349
rcu_read_unlock();
13241350
io_worker_ref_put(wq);
1325-
wait_for_completion(&wq->worker_done);
1351+
1352+
/*
1353+
* Shut up hung task complaint, see for example
1354+
*
1355+
* https://lore.kernel.org/all/696fc9e7.a70a0220.111c58.0006.GAE@google.com/
1356+
*
1357+
* where completely overloading the system with tons of long running
1358+
* io-wq items can easily trigger the hung task timeout. Only sleep
1359+
* uninterruptibly for half that time, and warn if we exceeded end
1360+
* up waiting more than IO_URING_EXIT_WAIT_MAX.
1361+
*/
1362+
timeout = sysctl_hung_task_timeout_secs * HZ / 2;
1363+
if (!timeout)
1364+
timeout = MAX_SCHEDULE_TIMEOUT;
1365+
warn_timeout = jiffies + IO_URING_EXIT_WAIT_MAX;
1366+
do {
1367+
if (wait_for_completion_timeout(&wq->worker_done, timeout))
1368+
break;
1369+
WARN_ON_ONCE(time_after(jiffies, warn_timeout));
1370+
} while (1);
13261371

13271372
spin_lock_irq(&wq->hash->wait.lock);
13281373
list_del_init(&wq->wait.entry);

0 commit comments

Comments
 (0)