Skip to content

Commit 0ac3b5c

Browse files
KAGA-KOKO authored and Peter Zijlstra committed
rseq: Implement time slice extension enforcement timer
If a time slice extension is granted and the reschedule delayed, the kernel has to ensure that user space cannot abuse the extension and exceed the maximum granted time. It was suggested to implement this via the existing hrtick() timer in the scheduler, but that turned out to be problematic for several reasons: 1) It creates a dependency on CONFIG_SCHED_HRTICK, which can be disabled independently of CONFIG_HIGHRES_TIMERS 2) HRTICK usage in the scheduler can be runtime disabled or is only used for certain aspects of scheduling. 3) The function is calling into the scheduler code and that might have unexpected consequences when this is invoked due to a time slice enforcement expiry. Especially when the task managed to clear the grant via sched_yield(0). It would be possible to address #2 and #3 by storing state in the scheduler, but that is extra complexity and fragility for no value. Implement a dedicated per CPU hrtimer instead, which is solely used for the purpose of time slice enforcement. The timer is armed when an extension was granted right before actually returning to user mode in rseq_exit_to_user_mode_restart(). It is disarmed, when the task relinquishes the CPU. This is expensive as the timer is probably the first expiring timer on the CPU, which means it has to reprogram the hardware. But that's less expensive than going through a full hrtimer interrupt cycle for nothing. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Link: https://patch.msgid.link/20251215155709.068329497@linutronix.de
1 parent dd0a046 commit 0ac3b5c

File tree

4 files changed

+170
-13
lines changed

4 files changed

+170
-13
lines changed

Documentation/admin-guide/sysctl/kernel.rst

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1248,6 +1248,17 @@ reboot-cmd (SPARC only)
12481248
ROM/Flash boot loader. Maybe to tell it what to do after
12491249
rebooting. ???
12501250

1251+
rseq_slice_extension_nsec
1252+
=========================
1253+
1254+
A task can request to delay its scheduling if it is in a critical section
1255+
via the prctl(PR_RSEQ_SLICE_EXTENSION_SET) mechanism. This sets the maximum
1256+
allowed extension in nanoseconds before scheduling of the task is enforced.
1257+
Default value is 10000ns (10us). The possible range is 10000ns (10us) to
1258+
50000ns (50us).
1259+
1260+
This value has a direct correlation to the worst case scheduling latency;
1261+
increment at your own risk.
12511262

12521263
sched_energy_aware
12531264
==================

include/linux/rseq_entry.h

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -87,8 +87,24 @@ static __always_inline bool rseq_slice_extension_enabled(void)
8787
{
8888
return static_branch_likely(&rseq_slice_extension_key);
8989
}
90+
91+
extern unsigned int rseq_slice_ext_nsecs;
92+
bool __rseq_arm_slice_extension_timer(void);
93+
94+
static __always_inline bool rseq_arm_slice_extension_timer(void)
95+
{
96+
if (!rseq_slice_extension_enabled())
97+
return false;
98+
99+
if (likely(!current->rseq.slice.state.granted))
100+
return false;
101+
102+
return __rseq_arm_slice_extension_timer();
103+
}
104+
90105
#else /* CONFIG_RSEQ_SLICE_EXTENSION */
91106
static inline bool rseq_slice_extension_enabled(void) { return false; }
107+
static inline bool rseq_arm_slice_extension_timer(void) { return false; }
92108
#endif /* !CONFIG_RSEQ_SLICE_EXTENSION */
93109

94110
bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);
@@ -543,17 +559,19 @@ static __always_inline void clear_tif_rseq(void) { }
543559
static __always_inline bool
544560
rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
545561
{
546-
if (likely(!test_tif_rseq(ti_work)))
547-
return false;
548-
549-
if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
550-
current->rseq.event.slowpath = true;
551-
set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
552-
return true;
562+
if (unlikely(test_tif_rseq(ti_work))) {
563+
if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
564+
current->rseq.event.slowpath = true;
565+
set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
566+
return true;
567+
}
568+
clear_tif_rseq();
553569
}
554-
555-
clear_tif_rseq();
556-
return false;
570+
/*
571+
* Arm the slice extension timer if nothing to do anymore and the
572+
* task really goes out to user space.
573+
*/
574+
return rseq_arm_slice_extension_timer();
557575
}
558576

559577
#else /* CONFIG_GENERIC_ENTRY */

include/linux/rseq_types.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,10 +89,12 @@ union rseq_slice_state {
8989
/**
9090
* struct rseq_slice - Status information for rseq time slice extension
9191
* @state: Time slice extension state
92+
* @expires: The time when a grant expires
9293
* @yielded: Indicator for rseq_slice_yield()
9394
*/
9495
struct rseq_slice {
9596
union rseq_slice_state state;
97+
u64 expires;
9698
u8 yielded;
9799
};
98100

kernel/rseq.c

Lines changed: 129 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,8 @@
7171
#define RSEQ_BUILD_SLOW_PATH
7272

7373
#include <linux/debugfs.h>
74+
#include <linux/hrtimer.h>
75+
#include <linux/percpu.h>
7476
#include <linux/prctl.h>
7577
#include <linux/ratelimit.h>
7678
#include <linux/rseq_entry.h>
@@ -500,8 +502,91 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
500502
}
501503

502504
#ifdef CONFIG_RSEQ_SLICE_EXTENSION
505+
struct slice_timer {
506+
struct hrtimer timer;
507+
void *cookie;
508+
};
509+
510+
unsigned int rseq_slice_ext_nsecs __read_mostly = 10 * NSEC_PER_USEC;
511+
static DEFINE_PER_CPU(struct slice_timer, slice_timer);
503512
DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key);
504513

514+
/*
515+
* When the timer expires and the task is still in user space, the return
516+
* from interrupt will revoke the grant and schedule. If the task already
517+
* entered the kernel via a syscall and the timer fires before the syscall
518+
* work was able to cancel it, then depending on the preemption model this
519+
* will either reschedule on return from interrupt or in the syscall work
520+
* below.
521+
*/
522+
static enum hrtimer_restart rseq_slice_expired(struct hrtimer *tmr)
523+
{
524+
struct slice_timer *st = container_of(tmr, struct slice_timer, timer);
525+
526+
/*
527+
* Validate that the task which armed the timer is still on the
528+
* CPU. It could have been scheduled out without canceling the
529+
* timer.
530+
*/
531+
if (st->cookie == current && current->rseq.slice.state.granted) {
532+
rseq_stat_inc(rseq_stats.s_expired);
533+
set_need_resched_current();
534+
}
535+
return HRTIMER_NORESTART;
536+
}
537+
538+
bool __rseq_arm_slice_extension_timer(void)
539+
{
540+
struct slice_timer *st = this_cpu_ptr(&slice_timer);
541+
struct task_struct *curr = current;
542+
543+
lockdep_assert_irqs_disabled();
544+
545+
/*
546+
* This check prevents a task, which got a time slice extension
547+
* granted, from exceeding the maximum scheduling latency when the
548+
* grant expired before going out to user space. Don't bother to
549+
* clear the grant here, it will be cleaned up automatically before
550+
* going out to user space after being scheduled back in.
551+
*/
552+
if ((unlikely(curr->rseq.slice.expires < ktime_get_mono_fast_ns()))) {
553+
set_need_resched_current();
554+
return true;
555+
}
556+
557+
/*
558+
* Store the task pointer as a cookie for comparison in the timer
559+
* function. This is safe as the timer is CPU local and cannot be
560+
* in the expiry function at this point.
561+
*/
562+
st->cookie = curr;
563+
hrtimer_start(&st->timer, curr->rseq.slice.expires, HRTIMER_MODE_ABS_PINNED_HARD);
564+
/* Arm the syscall entry work */
565+
set_task_syscall_work(curr, SYSCALL_RSEQ_SLICE);
566+
return false;
567+
}
568+
569+
static void rseq_cancel_slice_extension_timer(void)
570+
{
571+
struct slice_timer *st = this_cpu_ptr(&slice_timer);
572+
573+
/*
574+
* st->cookie can be safely read as preemption is disabled and the
575+
* timer is CPU local.
576+
*
577+
* As this is most probably the first expiring timer, the cancel is
578+
* expensive as it has to reprogram the hardware, but that's less
579+
* expensive than going through a full hrtimer_interrupt() cycle
580+
* for nothing.
581+
*
582+
* hrtimer_try_to_cancel() is sufficient here as the timer is CPU
583+
* local and once the hrtimer code disabled interrupts the timer
584+
* callback cannot be running.
585+
*/
586+
if (st->cookie == current)
587+
hrtimer_try_to_cancel(&st->timer);
588+
}
589+
505590
static inline void rseq_slice_set_need_resched(struct task_struct *curr)
506591
{
507592
/*
@@ -563,11 +648,14 @@ void rseq_syscall_enter_work(long syscall)
563648
return;
564649

565650
/*
566-
* Required to make set_tsk_need_resched() correct on PREEMPT[RT]
567-
* kernels. Leaving the scope will reschedule on preemption models
568-
* FULL, LAZY and RT if necessary.
651+
* Required to stabilize the per CPU timer pointer and to make
652+
* set_tsk_need_resched() correct on PREEMPT[RT] kernels.
653+
*
654+
* Leaving the scope will reschedule on preemption models FULL,
655+
* LAZY and RT if necessary.
569656
*/
570657
scoped_guard(preempt) {
658+
rseq_cancel_slice_extension_timer();
571659
/*
572660
* Now that preemption is disabled, quickly check whether
573661
* the task was already rescheduled before arriving here.
@@ -665,6 +753,31 @@ SYSCALL_DEFINE0(rseq_slice_yield)
665753
return yielded;
666754
}
667755

756+
#ifdef CONFIG_SYSCTL
757+
static const unsigned int rseq_slice_ext_nsecs_min = 10 * NSEC_PER_USEC;
758+
static const unsigned int rseq_slice_ext_nsecs_max = 50 * NSEC_PER_USEC;
759+
760+
static const struct ctl_table rseq_slice_ext_sysctl[] = {
761+
{
762+
.procname = "rseq_slice_extension_nsec",
763+
.data = &rseq_slice_ext_nsecs,
764+
.maxlen = sizeof(unsigned int),
765+
.mode = 0644,
766+
.proc_handler = proc_douintvec_minmax,
767+
.extra1 = (unsigned int *)&rseq_slice_ext_nsecs_min,
768+
.extra2 = (unsigned int *)&rseq_slice_ext_nsecs_max,
769+
},
770+
};
771+
772+
static void rseq_slice_sysctl_init(void)
773+
{
774+
if (rseq_slice_extension_enabled())
775+
register_sysctl_init("kernel", rseq_slice_ext_sysctl);
776+
}
777+
#else /* CONFIG_SYSCTL */
778+
static inline void rseq_slice_sysctl_init(void) { }
779+
#endif /* !CONFIG_SYSCTL */
780+
668781
static int __init rseq_slice_cmdline(char *str)
669782
{
670783
bool on;
@@ -677,4 +790,17 @@ static int __init rseq_slice_cmdline(char *str)
677790
return 1;
678791
}
679792
__setup("rseq_slice_ext=", rseq_slice_cmdline);
793+
794+
static int __init rseq_slice_init(void)
795+
{
796+
unsigned int cpu;
797+
798+
for_each_possible_cpu(cpu) {
799+
hrtimer_setup(per_cpu_ptr(&slice_timer.timer, cpu), rseq_slice_expired,
800+
CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED_HARD);
801+
}
802+
rseq_slice_sysctl_init();
803+
return 0;
804+
}
805+
device_initcall(rseq_slice_init);
680806
#endif /* CONFIG_RSEQ_SLICE_EXTENSION */

0 commit comments

Comments (0)