Skip to content

Commit dd0a046

Browse files
KAGA-KOKO authored and Peter Zijlstra committed
rseq: Implement syscall entry work for time slice extensions
The kernel sets SYSCALL_WORK_RSEQ_SLICE when it grants a time slice extension. This allows to handle the rseq_slice_yield() syscall, which is used by user space to relinquish the CPU after finishing the critical section for which it requested an extension. In case the kernel state is still GRANTED, the kernel resets both kernel and user space state with a set of sanity checks. If the kernel state is already cleared, then this raced against the timer or some other interrupt and just clears the work bit. Doing it in syscall entry work allows to catch misbehaving user space, which issues an arbitrary syscall, i.e. not rseq_slice_yield(), from the critical section. Contrary to the initial strict requirement to use rseq_slice_yield() arbitrary syscalls are not considered a violation of the ABI contract anymore to allow onion architecture applications, which cannot control the code inside a critical section, to utilize this as well. If the code detects inconsistent user space that result in a SIGSEGV for the application. If the grant was still active and the task was not preempted yet, the work code reschedules immediately before continuing through the syscall. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://patch.msgid.link/20251215155709.005777059@linutronix.de
1 parent 99d2592 commit dd0a046

File tree

5 files changed

+112
-10
lines changed

5 files changed

+112
-10
lines changed

include/linux/entry-common.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,8 @@
3636
SYSCALL_WORK_SYSCALL_EMU | \
3737
SYSCALL_WORK_SYSCALL_AUDIT | \
3838
SYSCALL_WORK_SYSCALL_USER_DISPATCH | \
39+
SYSCALL_WORK_SYSCALL_RSEQ_SLICE | \
3940
ARCH_SYSCALL_WORK_ENTER)
40-
4141
#define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \
4242
SYSCALL_WORK_SYSCALL_TRACE | \
4343
SYSCALL_WORK_SYSCALL_AUDIT | \

include/linux/rseq.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,8 +164,10 @@ static inline void rseq_syscall(struct pt_regs *regs) { }
164164
#endif /* !CONFIG_DEBUG_RSEQ */
165165

166166
#ifdef CONFIG_RSEQ_SLICE_EXTENSION
167+
void rseq_syscall_enter_work(long syscall);
167168
int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3);
168169
#else /* CONFIG_RSEQ_SLICE_EXTENSION */
170+
static inline void rseq_syscall_enter_work(long syscall) { }
169171
static inline int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3)
170172
{
171173
return -ENOTSUPP;

include/linux/thread_info.h

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -46,15 +46,17 @@ enum syscall_work_bit {
4646
SYSCALL_WORK_BIT_SYSCALL_AUDIT,
4747
SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH,
4848
SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP,
49+
SYSCALL_WORK_BIT_SYSCALL_RSEQ_SLICE,
4950
};
5051

51-
#define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP)
52-
#define SYSCALL_WORK_SYSCALL_TRACEPOINT BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT)
53-
#define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE)
54-
#define SYSCALL_WORK_SYSCALL_EMU BIT(SYSCALL_WORK_BIT_SYSCALL_EMU)
55-
#define SYSCALL_WORK_SYSCALL_AUDIT BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT)
56-
#define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH)
57-
#define SYSCALL_WORK_SYSCALL_EXIT_TRAP BIT(SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP)
52+
#define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP)
53+
#define SYSCALL_WORK_SYSCALL_TRACEPOINT BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT)
54+
#define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE)
55+
#define SYSCALL_WORK_SYSCALL_EMU BIT(SYSCALL_WORK_BIT_SYSCALL_EMU)
56+
#define SYSCALL_WORK_SYSCALL_AUDIT BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT)
57+
#define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH)
58+
#define SYSCALL_WORK_SYSCALL_EXIT_TRAP BIT(SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP)
59+
#define SYSCALL_WORK_SYSCALL_RSEQ_SLICE BIT(SYSCALL_WORK_BIT_SYSCALL_RSEQ_SLICE)
5860
#endif
5961

6062
#include <asm/thread_info.h>

kernel/entry/syscall-common.c

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,7 @@ static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
1717
}
1818
}
1919

20-
long syscall_trace_enter(struct pt_regs *regs, long syscall,
21-
unsigned long work)
20+
long syscall_trace_enter(struct pt_regs *regs, long syscall, unsigned long work)
2221
{
2322
long ret = 0;
2423

@@ -32,6 +31,14 @@ long syscall_trace_enter(struct pt_regs *regs, long syscall,
3231
return -1L;
3332
}
3433

34+
/*
35+
* User space got a time slice extension granted and relinquishes
36+
* the CPU. The work stops the slice timer to avoid an extra round
37+
* through hrtimer_interrupt().
38+
*/
39+
if (work & SYSCALL_WORK_SYSCALL_RSEQ_SLICE)
40+
rseq_syscall_enter_work(syscall);
41+
3542
/* Handle ptrace */
3643
if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
3744
ret = ptrace_report_syscall_entry(regs);

kernel/rseq.c

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -502,6 +502,97 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
502502
#ifdef CONFIG_RSEQ_SLICE_EXTENSION
503503
DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key);
504504

505+
static inline void rseq_slice_set_need_resched(struct task_struct *curr)
506+
{
507+
/*
508+
* The interrupt guard is required to prevent inconsistent state in
509+
* this case:
510+
*
511+
* set_tsk_need_resched()
512+
* --> Interrupt
513+
* wakeup()
514+
* set_tsk_need_resched()
515+
* set_preempt_need_resched()
516+
* schedule_on_return()
517+
* clear_tsk_need_resched()
518+
* clear_preempt_need_resched()
519+
* set_preempt_need_resched() <- Inconsistent state
520+
*
521+
* This is safe vs. a remote set of TIF_NEED_RESCHED because that
522+
* only sets the already set bit and does not create inconsistent
523+
* state.
524+
*/
525+
scoped_guard(irq)
526+
set_need_resched_current();
527+
}
528+
529+
static void rseq_slice_validate_ctrl(u32 expected)
{
	u32 __user *uctrl = &current->rseq.usrptr->slice_ctrl.all;
	u32 val;

	/* Unreadable or mismatching user space state is fatal for the task */
	if (get_user(val, uctrl) || val != expected)
		force_sig(SIGSEGV);
}
537+
538+
/*
539+
* Invoked from syscall entry if a time slice extension was granted and the
540+
* kernel did not clear it before user space left the critical section.
541+
*
542+
* While the recommended way to relinquish the CPU side effect free is
543+
* rseq_slice_yield(2), any syscall within a granted slice terminates the
544+
* grant and immediately reschedules if required. This supports onion layer
545+
* applications, where the code requesting the grant cannot control the
546+
* code within the critical section.
547+
*/
548+
void rseq_syscall_enter_work(long syscall)
549+
{
550+
struct task_struct *curr = current;
551+
struct rseq_slice_ctrl ctrl = { .granted = curr->rseq.slice.state.granted };
552+
553+
clear_task_syscall_work(curr, SYSCALL_RSEQ_SLICE);
554+
555+
if (static_branch_unlikely(&rseq_debug_enabled))
556+
rseq_slice_validate_ctrl(ctrl.all);
557+
558+
/*
559+
* The kernel might have raced, revoked the grant and updated
560+
* userspace, but kept the SLICE work set.
561+
*/
562+
if (!ctrl.granted)
563+
return;
564+
565+
/*
566+
* Required to make set_tsk_need_resched() correct on PREEMPT[RT]
567+
* kernels. Leaving the scope will reschedule on preemption models
568+
* FULL, LAZY and RT if necessary.
569+
*/
570+
scoped_guard(preempt) {
571+
/*
572+
* Now that preemption is disabled, quickly check whether
573+
* the task was already rescheduled before arriving here.
574+
*/
575+
if (!curr->rseq.event.sched_switch) {
576+
rseq_slice_set_need_resched(curr);
577+
578+
if (syscall == __NR_rseq_slice_yield) {
579+
rseq_stat_inc(rseq_stats.s_yielded);
580+
/* Update the yielded state for syscall return */
581+
curr->rseq.slice.yielded = 1;
582+
} else {
583+
rseq_stat_inc(rseq_stats.s_aborted);
584+
}
585+
}
586+
}
587+
/* Reschedule on NONE/VOLUNTARY preemption models */
588+
cond_resched();
589+
590+
/* Clear the grant in kernel state and user space */
591+
curr->rseq.slice.state.granted = false;
592+
if (put_user(0U, &curr->rseq.usrptr->slice_ctrl.all))
593+
force_sig(SIGSEGV);
594+
}
595+
505596
int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3)
506597
{
507598
switch (arg2) {

0 commit comments

Comments
 (0)