Skip to content

Commit 6a8dab0

Browse files
committed
Merge tag 'sched_ext-for-7.0-rc2-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext
Pull sched_ext fixes from Tejun Heo: - Fix starvation of scx_enable() under fair-class saturation by offloading the enable path to an RT kthread - Fix out-of-bounds access in idle mask initialization on systems with non-contiguous NUMA node IDs - Fix a preemption window during scheduler exit and a refcount underflow in cgroup init error path - Fix SCX_EFLAG_INITIALIZED being a no-op flag - Add READ_ONCE() annotations for KCSAN-clean lockless accesses and replace naked scx_root dereferences with container_of() in kobject callbacks - Tooling and selftest fixes: compilation issues with clang 17, strtoul() misuse, unused options cleanup, and Kconfig sync * tag 'sched_ext-for-7.0-rc2-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext: sched_ext: Fix starvation of scx_enable() under fair-class saturation sched_ext: Remove redundant css_put() in scx_cgroup_init() selftests/sched_ext: Fix peek_dsq.bpf.c compile error for clang 17 selftests/sched_ext: Add -fms-extensions to bpf build flags tools/sched_ext: Add -fms-extensions to bpf build flags sched_ext: Use READ_ONCE() for plain reads of scx_watchdog_timeout sched_ext: Replace naked scx_root dereferences in kobject callbacks sched_ext: Use READ_ONCE() for the read side of dsq->nr update tools/sched_ext: fix strtoul() misuse in scx_hotplug_seq() sched_ext: Fix SCX_EFLAG_INITIALIZED being a no-op flag sched_ext: Fix out-of-bounds access in scx_idle_init_masks() sched_ext: Disable preemption between scx_claim_exit() and kicking helper work tools/sched_ext: Add Kconfig to sync with upstream tools/sched_ext: Sync README.md Kconfig with upstream scx selftests/sched_ext: Remove duplicated unistd.h include in rt_stall.c tools/sched_ext: scx_sdt: Remove unused '-f' option tools/sched_ext: scx_central: Remove unused '-p' option selftests/sched_ext: Fix unused-result warning for read() selftests/sched_ext: Abort test loop on signal
2 parents c44db6c + b06ccba commit 6a8dab0

File tree

14 files changed

+165
-36
lines changed

14 files changed

+165
-36
lines changed

kernel/sched/ext.c

Lines changed: 83 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -976,8 +976,12 @@ static bool scx_dsq_priq_less(struct rb_node *node_a,
976976

977977
static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta)
978978
{
979-
/* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */
980-
WRITE_ONCE(dsq->nr, dsq->nr + delta);
979+
/*
980+
* scx_bpf_dsq_nr_queued() reads ->nr without locking. Use READ_ONCE()
981+
* on the read side and WRITE_ONCE() on the write side to properly
982+
* annotate the concurrent lockless access and avoid KCSAN warnings.
983+
*/
984+
WRITE_ONCE(dsq->nr, READ_ONCE(dsq->nr) + delta);
981985
}
982986

983987
static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p)
@@ -2735,7 +2739,7 @@ static bool check_rq_for_timeouts(struct rq *rq)
27352739
unsigned long last_runnable = p->scx.runnable_at;
27362740

27372741
if (unlikely(time_after(jiffies,
2738-
last_runnable + scx_watchdog_timeout))) {
2742+
last_runnable + READ_ONCE(scx_watchdog_timeout)))) {
27392743
u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable);
27402744

27412745
scx_exit(sch, SCX_EXIT_ERROR_STALL, 0,
@@ -2763,7 +2767,7 @@ static void scx_watchdog_workfn(struct work_struct *work)
27632767
cond_resched();
27642768
}
27652769
queue_delayed_work(system_unbound_wq, to_delayed_work(work),
2766-
scx_watchdog_timeout / 2);
2770+
READ_ONCE(scx_watchdog_timeout) / 2);
27672771
}
27682772

27692773
void scx_tick(struct rq *rq)
@@ -3585,7 +3589,6 @@ static int scx_cgroup_init(struct scx_sched *sch)
35853589
ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL,
35863590
css->cgroup, &args);
35873591
if (ret) {
3588-
css_put(css);
35893592
scx_error(sch, "ops.cgroup_init() failed (%d)", ret);
35903593
return ret;
35913594
}
@@ -3708,7 +3711,9 @@ static void scx_kobj_release(struct kobject *kobj)
37083711
static ssize_t scx_attr_ops_show(struct kobject *kobj,
37093712
struct kobj_attribute *ka, char *buf)
37103713
{
3711-
return sysfs_emit(buf, "%s\n", scx_root->ops.name);
3714+
struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
3715+
3716+
return sysfs_emit(buf, "%s\n", sch->ops.name);
37123717
}
37133718
SCX_ATTR(ops);
37143719

@@ -3752,7 +3757,9 @@ static const struct kobj_type scx_ktype = {
37523757

37533758
static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env)
37543759
{
3755-
return add_uevent_var(env, "SCXOPS=%s", scx_root->ops.name);
3760+
const struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
3761+
3762+
return add_uevent_var(env, "SCXOPS=%s", sch->ops.name);
37563763
}
37573764

37583765
static const struct kset_uevent_ops scx_uevent_ops = {
@@ -4423,10 +4430,19 @@ static void scx_disable_workfn(struct kthread_work *work)
44234430
scx_bypass(false);
44244431
}
44254432

4433+
/*
4434+
* Claim the exit on @sch. The caller must ensure that the helper kthread work
4435+
* is kicked before the current task can be preempted. Once exit_kind is
4436+
* claimed, scx_error() can no longer trigger, so if the current task gets
4437+
* preempted and the BPF scheduler fails to schedule it back, the helper work
4438+
* will never be kicked and the whole system can wedge.
4439+
*/
44264440
static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind)
44274441
{
44284442
int none = SCX_EXIT_NONE;
44294443

4444+
lockdep_assert_preemption_disabled();
4445+
44304446
if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind))
44314447
return false;
44324448

@@ -4449,6 +4465,7 @@ static void scx_disable(enum scx_exit_kind kind)
44494465
rcu_read_lock();
44504466
sch = rcu_dereference(scx_root);
44514467
if (sch) {
4468+
guard(preempt)();
44524469
scx_claim_exit(sch, kind);
44534470
kthread_queue_work(sch->helper, &sch->disable_work);
44544471
}
@@ -4771,6 +4788,8 @@ static bool scx_vexit(struct scx_sched *sch,
47714788
{
47724789
struct scx_exit_info *ei = sch->exit_info;
47734790

4791+
guard(preempt)();
4792+
47744793
if (!scx_claim_exit(sch, kind))
47754794
return false;
47764795

@@ -4955,20 +4974,30 @@ static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops)
49554974
return 0;
49564975
}
49574976

4958-
static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
4977+
/*
4978+
* scx_enable() is offloaded to a dedicated system-wide RT kthread to avoid
4979+
* starvation. During the READY -> ENABLED task switching loop, the calling
4980+
* thread's sched_class gets switched from fair to ext. As fair has higher
4981+
* priority than ext, the calling thread can be indefinitely starved under
4982+
* fair-class saturation, leading to a system hang.
4983+
*/
4984+
struct scx_enable_cmd {
4985+
struct kthread_work work;
4986+
struct sched_ext_ops *ops;
4987+
int ret;
4988+
};
4989+
4990+
static void scx_enable_workfn(struct kthread_work *work)
49594991
{
4992+
struct scx_enable_cmd *cmd =
4993+
container_of(work, struct scx_enable_cmd, work);
4994+
struct sched_ext_ops *ops = cmd->ops;
49604995
struct scx_sched *sch;
49614996
struct scx_task_iter sti;
49624997
struct task_struct *p;
49634998
unsigned long timeout;
49644999
int i, cpu, ret;
49655000

4966-
if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
4967-
cpu_possible_mask)) {
4968-
pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n");
4969-
return -EINVAL;
4970-
}
4971-
49725001
mutex_lock(&scx_enable_mutex);
49735002

49745003
if (scx_enable_state() != SCX_DISABLED) {
@@ -5060,7 +5089,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
50605089
WRITE_ONCE(scx_watchdog_timeout, timeout);
50615090
WRITE_ONCE(scx_watchdog_timestamp, jiffies);
50625091
queue_delayed_work(system_unbound_wq, &scx_watchdog_work,
5063-
scx_watchdog_timeout / 2);
5092+
READ_ONCE(scx_watchdog_timeout) / 2);
50645093

50655094
/*
50665095
* Once __scx_enabled is set, %current can be switched to SCX anytime.
@@ -5185,13 +5214,15 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
51855214

51865215
atomic_long_inc(&scx_enable_seq);
51875216

5188-
return 0;
5217+
cmd->ret = 0;
5218+
return;
51895219

51905220
err_free_ksyncs:
51915221
free_kick_syncs();
51925222
err_unlock:
51935223
mutex_unlock(&scx_enable_mutex);
5194-
return ret;
5224+
cmd->ret = ret;
5225+
return;
51955226

51965227
err_disable_unlock_all:
51975228
scx_cgroup_unlock();
@@ -5210,7 +5241,41 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
52105241
*/
52115242
scx_error(sch, "scx_enable() failed (%d)", ret);
52125243
kthread_flush_work(&sch->disable_work);
5213-
return 0;
5244+
cmd->ret = 0;
5245+
}
5246+
5247+
static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
5248+
{
5249+
static struct kthread_worker *helper;
5250+
static DEFINE_MUTEX(helper_mutex);
5251+
struct scx_enable_cmd cmd;
5252+
5253+
if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
5254+
cpu_possible_mask)) {
5255+
pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n");
5256+
return -EINVAL;
5257+
}
5258+
5259+
if (!READ_ONCE(helper)) {
5260+
mutex_lock(&helper_mutex);
5261+
if (!helper) {
5262+
helper = kthread_run_worker(0, "scx_enable_helper");
5263+
if (IS_ERR_OR_NULL(helper)) {
5264+
helper = NULL;
5265+
mutex_unlock(&helper_mutex);
5266+
return -ENOMEM;
5267+
}
5268+
sched_set_fifo(helper->task);
5269+
}
5270+
mutex_unlock(&helper_mutex);
5271+
}
5272+
5273+
kthread_init_work(&cmd.work, scx_enable_workfn);
5274+
cmd.ops = ops;
5275+
5276+
kthread_queue_work(READ_ONCE(helper), &cmd.work);
5277+
kthread_flush_work(&cmd.work);
5278+
return cmd.ret;
52145279
}
52155280

52165281

kernel/sched/ext_idle.c

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -663,9 +663,8 @@ void scx_idle_init_masks(void)
663663
BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.cpu, GFP_KERNEL));
664664
BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.smt, GFP_KERNEL));
665665

666-
/* Allocate per-node idle cpumasks */
667-
scx_idle_node_masks = kzalloc_objs(*scx_idle_node_masks,
668-
num_possible_nodes());
666+
/* Allocate per-node idle cpumasks (use nr_node_ids for non-contiguous NUMA nodes) */
667+
scx_idle_node_masks = kzalloc_objs(*scx_idle_node_masks, nr_node_ids);
669668
BUG_ON(!scx_idle_node_masks);
670669

671670
for_each_node(i) {

kernel/sched/ext_internal.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ enum scx_exit_flags {
7474
* info communication. The following flag indicates whether ops.init()
7575
* finished successfully.
7676
*/
77-
SCX_EFLAG_INITIALIZED,
77+
SCX_EFLAG_INITIALIZED = 1LLU << 0,
7878
};
7979

8080
/*

tools/sched_ext/Kconfig

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
# sched-ext mandatory options
2+
#
3+
CONFIG_BPF=y
4+
CONFIG_BPF_SYSCALL=y
5+
CONFIG_BPF_JIT=y
6+
CONFIG_DEBUG_INFO_BTF=y
7+
CONFIG_BPF_JIT_ALWAYS_ON=y
8+
CONFIG_BPF_JIT_DEFAULT_ON=y
9+
CONFIG_SCHED_CLASS_EXT=y
10+
11+
# Required by some rust schedulers (e.g. scx_p2dq)
12+
#
13+
CONFIG_KALLSYMS_ALL=y
14+
15+
# Required on arm64
16+
#
17+
# CONFIG_DEBUG_INFO_REDUCED is not set
18+
19+
# LAVD tracks futex to give an additional time slice for futex holder
20+
# (i.e., avoiding lock holder preemption) for better system-wide progress.
21+
# LAVD first tries to use ftrace to trace futex function calls.
22+
# If that is not available, it tries to use a tracepoint.
23+
CONFIG_FUNCTION_TRACER=y
24+
25+
# Enable scheduling debugging
26+
#
27+
CONFIG_SCHED_DEBUG=y
28+
29+
# Enable extra scheduling features (for a better code coverage while testing
30+
# the schedulers)
31+
#
32+
CONFIG_SCHED_AUTOGROUP=y
33+
CONFIG_SCHED_CORE=y
34+
CONFIG_SCHED_MC=y
35+
36+
# Enable fully preemptible kernel for a better test coverage of the schedulers
37+
#
38+
# CONFIG_PREEMPT_NONE is not set
39+
# CONFIG_PREEMPT_VOLUNTARY is not set
40+
CONFIG_PREEMPT=y
41+
CONFIG_PREEMPT_DYNAMIC=y
42+
43+
# Additional debugging information (useful to catch potential locking issues)
44+
CONFIG_DEBUG_LOCKDEP=y
45+
CONFIG_DEBUG_ATOMIC_SLEEP=y
46+
CONFIG_PROVE_LOCKING=y
47+
48+
# Bpftrace headers (for additional debug info)
49+
CONFIG_BPF_EVENTS=y
50+
CONFIG_FTRACE_SYSCALLS=y
51+
CONFIG_DYNAMIC_FTRACE=y
52+
CONFIG_KPROBES=y
53+
CONFIG_KPROBE_EVENTS=y
54+
CONFIG_UPROBES=y
55+
CONFIG_UPROBE_EVENTS=y
56+
CONFIG_DEBUG_FS=y
57+
58+
# Enable access to kernel configuration and headers at runtime
59+
CONFIG_IKHEADERS=y
60+
CONFIG_IKCONFIG_PROC=y
61+
CONFIG_IKCONFIG=y

tools/sched_ext/Makefile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,8 @@ BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) \
122122
-I../../include \
123123
$(call get_sys_includes,$(CLANG)) \
124124
-Wall -Wno-compare-distinct-pointer-types \
125+
-Wno-microsoft-anon-tag \
126+
-fms-extensions \
125127
-O2 -mcpu=v3
126128

127129
# sort removes libbpf duplicates when not cross-building

tools/sched_ext/README.md

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -58,14 +58,8 @@ CONFIG_SCHED_CLASS_EXT=y
5858
CONFIG_BPF_SYSCALL=y
5959
CONFIG_BPF_JIT=y
6060
CONFIG_DEBUG_INFO_BTF=y
61-
```
62-
63-
It's also recommended that you also include the following Kconfig options:
64-
65-
```
6661
CONFIG_BPF_JIT_ALWAYS_ON=y
6762
CONFIG_BPF_JIT_DEFAULT_ON=y
68-
CONFIG_PAHOLE_HAS_BTF_TAG=y
6963
```
7064

7165
There is a `Kconfig` file in this directory whose contents you can append to

tools/sched_ext/include/scx/compat.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,7 @@ static inline long scx_hotplug_seq(void)
125125
{
126126
int fd;
127127
char buf[32];
128+
char *endptr;
128129
ssize_t len;
129130
long val;
130131

@@ -137,8 +138,10 @@ static inline long scx_hotplug_seq(void)
137138
buf[len] = 0;
138139
close(fd);
139140

140-
val = strtoul(buf, NULL, 10);
141-
SCX_BUG_ON(val < 0, "invalid num hotplug events: %lu", val);
141+
errno = 0;
142+
val = strtoul(buf, &endptr, 10);
143+
SCX_BUG_ON(errno == ERANGE || endptr == buf ||
144+
(*endptr != '\n' && *endptr != '\0'), "invalid num hotplug events: %ld", val);
142145

143146
return val;
144147
}

tools/sched_ext/scx_central.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ int main(int argc, char **argv)
6666
assert(skel->rodata->nr_cpu_ids > 0);
6767
assert(skel->rodata->nr_cpu_ids <= INT32_MAX);
6868

69-
while ((opt = getopt(argc, argv, "s:c:pvh")) != -1) {
69+
while ((opt = getopt(argc, argv, "s:c:vh")) != -1) {
7070
switch (opt) {
7171
case 's':
7272
skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;

tools/sched_ext/scx_sdt.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ int main(int argc, char **argv)
5454
optind = 1;
5555
skel = SCX_OPS_OPEN(sdt_ops, scx_sdt);
5656

57-
while ((opt = getopt(argc, argv, "fvh")) != -1) {
57+
while ((opt = getopt(argc, argv, "vh")) != -1) {
5858
switch (opt) {
5959
case 'v':
6060
verbose = true;

tools/testing/selftests/sched_ext/Makefile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,8 @@ BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) \
9393
$(CLANG_SYS_INCLUDES) \
9494
-Wall -Wno-compare-distinct-pointer-types \
9595
-Wno-incompatible-function-pointer-types \
96+
-Wno-microsoft-anon-tag \
97+
-fms-extensions \
9698
-O2 -mcpu=v3
9799

98100
# sort removes libbpf duplicates when not cross-building

0 commit comments

Comments (0)