Skip to content

Commit 653fda7

Browse files
committed
sched/mmcid: Switch over to the new mechanism
Now that all pieces are in place, change the implementations of sched_mm_cid_fork() and sched_mm_cid_exit() to adhere to the new strict ownership scheme and switch context_switch() over to use the new mm_cid_schedin() functionality. The common case is that there is no mode change required, which makes fork() and exit() just update the user count and the constraints. In case that a new user would exceed the CID space limit the fork() context handles the transition to per CPU mode with mm::mm_cid::mutex held. exit() handles the transition back to per task mode when the user count drops below the switch back threshold. fork() might also be forced to handle a deferred switch back to per task mode, when a affinity change increased the number of allowed CPUs enough. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Link: https://patch.msgid.link/20251119172550.280380631@linutronix.de
1 parent 9da6ccb commit 653fda7

File tree

5 files changed

+103
-116
lines changed

5 files changed

+103
-116
lines changed

include/linux/rseq.h

Lines changed: 0 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -84,24 +84,6 @@ static __always_inline void rseq_sched_set_ids_changed(struct task_struct *t)
8484
t->rseq.event.ids_changed = true;
8585
}
8686

87-
/*
88-
* Invoked from switch_mm_cid() in context switch when the task gets a MM
89-
* CID assigned.
90-
*
91-
* This does not raise TIF_NOTIFY_RESUME as that happens in
92-
* rseq_sched_switch_event().
93-
*/
94-
static __always_inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid)
95-
{
96-
/*
97-
* Requires a comparison as the switch_mm_cid() code does not
98-
* provide a conditional for it readily. So avoid excessive updates
99-
* when nothing changes.
100-
*/
101-
if (t->rseq.ids.mm_cid != cid)
102-
t->rseq.event.ids_changed = true;
103-
}
104-
10587
/* Enforce a full update after RSEQ registration and when execve() failed */
10688
static inline void rseq_force_update(void)
10789
{
@@ -169,7 +151,6 @@ static inline void rseq_handle_slowpath(struct pt_regs *regs) { }
169151
static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
170152
static inline void rseq_sched_switch_event(struct task_struct *t) { }
171153
static inline void rseq_sched_set_ids_changed(struct task_struct *t) { }
172-
static inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid) { }
173154
static inline void rseq_force_update(void) { }
174155
static inline void rseq_virt_userspace_exit(void) { }
175156
static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { }

include/linux/rseq_types.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -101,18 +101,18 @@ struct rseq_data { };
101101
/**
102102
* struct sched_mm_cid - Storage for per task MM CID data
103103
* @active: MM CID is active for the task
104-
* @cid: The CID associated to the task
105-
* @last_cid: The last CID associated to the task
104+
* @cid: The CID associated to the task either permanently or
105+
* borrowed from the CPU
106106
*/
107107
struct sched_mm_cid {
108108
unsigned int active;
109109
unsigned int cid;
110-
unsigned int last_cid;
111110
};
112111

113112
/**
114113
* struct mm_cid_pcpu - Storage for per CPU MM_CID data
115-
* @cid: The CID associated to the CPU
114+
* @cid: The CID associated to the CPU either permanently or
115+
* while a task with a CID is running
116116
*/
117117
struct mm_cid_pcpu {
118118
unsigned int cid;

kernel/fork.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -956,7 +956,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
956956

957957
#ifdef CONFIG_SCHED_MM_CID
958958
tsk->mm_cid.cid = MM_CID_UNSET;
959-
tsk->mm_cid.last_cid = MM_CID_UNSET;
960959
tsk->mm_cid.active = 0;
961960
#endif
962961
return tsk;

kernel/sched/core.c

Lines changed: 99 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -5307,7 +5307,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
53075307
}
53085308
}
53095309

5310-
switch_mm_cid(prev, next);
5310+
mm_cid_switch_to(prev, next);
53115311

53125312
/*
53135313
* Tell rseq that the task was scheduled in. Must be after
@@ -10624,7 +10624,7 @@ static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm
1062410624
return true;
1062510625
}
1062610626

10627-
static void __maybe_unused mm_cid_fixup_tasks_to_cpus(void)
10627+
static void mm_cid_fixup_tasks_to_cpus(void)
1062810628
{
1062910629
struct mm_struct *mm = current->mm;
1063010630
struct task_struct *p, *t;
@@ -10674,25 +10674,81 @@ static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm)
1067410674
void sched_mm_cid_fork(struct task_struct *t)
1067510675
{
1067610676
struct mm_struct *mm = t->mm;
10677+
bool percpu;
1067710678

1067810679
WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET);
1067910680

1068010681
guard(mutex)(&mm->mm_cid.mutex);
10681-
scoped_guard(raw_spinlock, &mm->mm_cid.lock) {
10682-
sched_mm_cid_add_user(t, mm);
10683-
/* Preset last_cid for mm_cid_select() */
10684-
t->mm_cid.last_cid = mm->mm_cid.max_cids - 1;
10682+
scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
10683+
struct mm_cid_pcpu *pcp = this_cpu_ptr(mm->mm_cid.pcpu);
10684+
10685+
/* First user ? */
10686+
if (!mm->mm_cid.users) {
10687+
sched_mm_cid_add_user(t, mm);
10688+
t->mm_cid.cid = mm_get_cid(mm);
10689+
/* Required for execve() */
10690+
pcp->cid = t->mm_cid.cid;
10691+
return;
10692+
}
10693+
10694+
if (!sched_mm_cid_add_user(t, mm)) {
10695+
if (!mm->mm_cid.percpu)
10696+
t->mm_cid.cid = mm_get_cid(mm);
10697+
return;
10698+
}
10699+
10700+
/* Handle the mode change and transfer current's CID */
10701+
percpu = !!mm->mm_cid.percpu;
10702+
if (!percpu)
10703+
mm_cid_transit_to_task(current, pcp);
10704+
else
10705+
mm_cid_transfer_to_cpu(current, pcp);
10706+
}
10707+
10708+
if (percpu) {
10709+
mm_cid_fixup_tasks_to_cpus();
10710+
} else {
10711+
mm_cid_fixup_cpus_to_tasks(mm);
10712+
t->mm_cid.cid = mm_get_cid(mm);
1068510713
}
1068610714
}
1068710715

1068810716
static bool sched_mm_cid_remove_user(struct task_struct *t)
1068910717
{
1069010718
t->mm_cid.active = 0;
10691-
mm_unset_cid_on_task(t);
10719+
scoped_guard(preempt) {
10720+
/* Clear the transition bit */
10721+
t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid);
10722+
mm_unset_cid_on_task(t);
10723+
}
1069210724
t->mm->mm_cid.users--;
1069310725
return mm_update_max_cids(t->mm);
1069410726
}
1069510727

10728+
static bool __sched_mm_cid_exit(struct task_struct *t)
10729+
{
10730+
struct mm_struct *mm = t->mm;
10731+
10732+
if (!sched_mm_cid_remove_user(t))
10733+
return false;
10734+
/*
10735+
* Contrary to fork() this only deals with a switch back to per
10736+
* task mode either because the above decreased users or an
10737+
* affinity change increased the number of allowed CPUs and the
10738+
* deferred fixup did not run yet.
10739+
*/
10740+
if (WARN_ON_ONCE(mm->mm_cid.percpu))
10741+
return false;
10742+
/*
10743+
* A failed fork(2) cleanup never gets here, so @current must have
10744+
* the same MM as @t. That's true for exit() and the failed
10745+
* pthread_create() cleanup case.
10746+
*/
10747+
if (WARN_ON_ONCE(current->mm != mm))
10748+
return false;
10749+
return true;
10750+
}
10751+
1069610752
/*
1069710753
* When a task exits, the MM CID held by the task is no longer required as
1069810754
* the task cannot return to user space.
@@ -10703,10 +10759,43 @@ void sched_mm_cid_exit(struct task_struct *t)
1070310759

1070410760
if (!mm || !t->mm_cid.active)
1070510761
return;
10762+
/*
10763+
* Ensure that only one instance is doing MM CID operations within
10764+
* a MM. The common case is uncontended. The rare fixup case adds
10765+
* some overhead.
10766+
*/
10767+
scoped_guard(mutex, &mm->mm_cid.mutex) {
10768+
/* mm_cid::mutex is sufficient to protect mm_cid::users */
10769+
if (likely(mm->mm_cid.users > 1)) {
10770+
scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
10771+
if (!__sched_mm_cid_exit(t))
10772+
return;
10773+
/* Mode change required. Transfer current's CID */
10774+
mm_cid_transit_to_task(current, this_cpu_ptr(mm->mm_cid.pcpu));
10775+
}
10776+
mm_cid_fixup_cpus_to_tasks(mm);
10777+
return;
10778+
}
10779+
/* Last user */
10780+
scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
10781+
/* Required across execve() */
10782+
if (t == current)
10783+
mm_cid_transit_to_task(t, this_cpu_ptr(mm->mm_cid.pcpu));
10784+
/* Ignore mode change. There is nothing to do. */
10785+
sched_mm_cid_remove_user(t);
10786+
}
10787+
}
1070610788

10707-
guard(mutex)(&mm->mm_cid.mutex);
10708-
scoped_guard(raw_spinlock, &mm->mm_cid.lock)
10709-
sched_mm_cid_remove_user(t);
10789+
/*
10790+
* As this is the last user (execve(), process exit or failed
10791+
* fork(2)) there is no concurrency anymore.
10792+
*
10793+
* Synchronize any pending work to ensure that there are no
10794+
* dangling references left. @t->mm_cid.users is zero so nothing
10795+
* can queue this work anymore.
10796+
*/
10797+
irq_work_sync(&mm->mm_cid.irq_work);
10798+
cancel_work_sync(&mm->mm_cid.work);
1071010799
}
1071110800

1071210801
/* Deactivate MM CID allocation across execve() */
@@ -10719,18 +10808,12 @@ void sched_mm_cid_before_execve(struct task_struct *t)
1071910808
void sched_mm_cid_after_execve(struct task_struct *t)
1072010809
{
1072110810
sched_mm_cid_fork(t);
10722-
guard(preempt)();
10723-
mm_cid_select(t);
1072410811
}
1072510812

1072610813
static void mm_cid_work_fn(struct work_struct *work)
1072710814
{
1072810815
struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.work);
1072910816

10730-
/* Make it compile, but not functional yet */
10731-
if (!IS_ENABLED(CONFIG_NEW_MM_CID))
10732-
return;
10733-
1073410817
guard(mutex)(&mm->mm_cid.mutex);
1073510818
/* Did the last user task exit already? */
1073610819
if (!mm->mm_cid.users)

kernel/sched/sched.h

Lines changed: 0 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -3745,83 +3745,7 @@ static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct
37453745
mm_cid_schedin(next);
37463746
}
37473747

3748-
/* Active implementation */
3749-
static inline void init_sched_mm_cid(struct task_struct *t)
3750-
{
3751-
struct mm_struct *mm = t->mm;
3752-
unsigned int max_cid;
3753-
3754-
if (!mm)
3755-
return;
3756-
3757-
/* Preset last_mm_cid */
3758-
max_cid = min_t(int, READ_ONCE(mm->mm_cid.nr_cpus_allowed), atomic_read(&mm->mm_users));
3759-
t->mm_cid.last_cid = max_cid - 1;
3760-
}
3761-
3762-
static inline bool __mm_cid_get(struct task_struct *t, unsigned int cid, unsigned int max_cids)
3763-
{
3764-
struct mm_struct *mm = t->mm;
3765-
3766-
if (cid >= max_cids)
3767-
return false;
3768-
if (test_and_set_bit(cid, mm_cidmask(mm)))
3769-
return false;
3770-
t->mm_cid.cid = t->mm_cid.last_cid = cid;
3771-
__this_cpu_write(mm->mm_cid.pcpu->cid, cid);
3772-
return true;
3773-
}
3774-
3775-
static inline bool mm_cid_get(struct task_struct *t)
3776-
{
3777-
struct mm_struct *mm = t->mm;
3778-
unsigned int max_cids;
3779-
3780-
max_cids = READ_ONCE(mm->mm_cid.max_cids);
3781-
3782-
/* Try to reuse the last CID of this task */
3783-
if (__mm_cid_get(t, t->mm_cid.last_cid, max_cids))
3784-
return true;
3785-
3786-
/* Try to reuse the last CID of this mm on this CPU */
3787-
if (__mm_cid_get(t, __this_cpu_read(mm->mm_cid.pcpu->cid), max_cids))
3788-
return true;
3789-
3790-
/* Try the first zero bit in the cidmask. */
3791-
return __mm_cid_get(t, find_first_zero_bit(mm_cidmask(mm), num_possible_cpus()), max_cids);
3792-
}
3793-
3794-
static inline void mm_cid_select(struct task_struct *t)
3795-
{
3796-
/*
3797-
* mm_cid_get() can fail when the maximum CID, which is determined
3798-
* by min(mm->nr_cpus_allowed, mm->mm_users) changes concurrently.
3799-
* That's a transient failure as there cannot be more tasks
3800-
* concurrently on a CPU (or about to be scheduled in) than that.
3801-
*/
3802-
for (;;) {
3803-
if (mm_cid_get(t))
3804-
break;
3805-
}
3806-
}
3807-
3808-
static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next)
3809-
{
3810-
if (prev->mm_cid.active) {
3811-
if (prev->mm_cid.cid != MM_CID_UNSET)
3812-
clear_bit(prev->mm_cid.cid, mm_cidmask(prev->mm));
3813-
prev->mm_cid.cid = MM_CID_UNSET;
3814-
}
3815-
3816-
if (next->mm_cid.active) {
3817-
mm_cid_select(next);
3818-
rseq_sched_set_task_mm_cid(next, next->mm_cid.cid);
3819-
}
3820-
}
3821-
38223748
#else /* !CONFIG_SCHED_MM_CID: */
3823-
static inline void mm_cid_select(struct task_struct *t) { }
3824-
static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { }
38253749
static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next) { }
38263750
#endif /* !CONFIG_SCHED_MM_CID */
38273751

0 commit comments

Comments
 (0)