Skip to content

Commit fbd0e71

Browse files
committed
sched/mmcid: Provide CID ownership mode fixup functions
CIDs are either owned by tasks or by CPUs. The ownership mode depends on the number of tasks related to an MM and the number of CPUs on which these tasks are theoretically allowed to run. Theoretically because that number is the superset of CPU affinities of all tasks which only grows and never shrinks. Switching to per CPU mode happens when the user count becomes greater than the maximum number of CIDs, which is calculated by: opt_cids = min(mm_cid::nr_cpus_allowed, mm_cid::users); max_cids = min(1.25 * opt_cids, nr_cpu_ids); The +25% allowance is useful for tight CPU masks in scenarios where only a few threads are created and destroyed to avoid frequent mode switches. Though this allowance shrinks, the closer opt_cids becomes to nr_cpu_ids, which is the (unfortunate) hard ABI limit. At the point of switching to per CPU mode the new user is not yet visible in the system, so the task which initiated the fork() runs the fixup function: mm_cid_fixup_tasks_to_cpus() walks the thread list and either transfers each task's owned CID to the CPU the task runs on or drops it into the CID pool if a task is not on a CPU at that point in time. Tasks which schedule in before the task walk reaches them do the handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus() completes it's guaranteed that no task related to that MM owns a CID anymore. Switching back to task mode happens when the user count goes below the threshold which was recorded on the per CPU mode switch: pcpu_thrs = min(opt_cids - (opt_cids / 4), nr_cpu_ids / 2); This threshold is updated when an affinity change increases the number of allowed CPUs for the MM, which might cause a switch back to per task mode. If the switch back was initiated by an exiting task, then that task runs the fixup function. If it was initiated by an affinity change, then it's run either in the deferred update function in context of a workqueue or by a task which forks a new one or by a task which exits. Whatever happens first. 
mm_cid_fixup_cpus_to_tasks() walks through the possible CPUs and either transfers the CPU owned CIDs to a related task which runs on the CPU or drops it into the pool. Tasks which schedule in on a CPU which the walk did not cover yet do the handover themselves. This transition from CPU to per task ownership happens in two phases: 1) mm:mm_cid.transit contains MM_CID_TRANSIT. This is OR'ed on the task CID and denotes that the CID is only temporarily owned by the task. When it schedules out the task drops the CID back into the pool if this bit is set. 2) The initiating context walks the per CPU space and after completion clears mm:mm_cid.transit. After that point the CIDs are strictly task owned again. This two phase transition is required to prevent CID space exhaustion during the transition as a direct transfer of ownership would fail if two tasks are scheduled in on the same CPU before the fixup freed per CPU CIDs. When mm_cid_fixup_cpus_to_tasks() completes it's guaranteed that no CID related to that MM is owned by a CPU anymore. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Link: https://patch.msgid.link/20251119172550.088189028@linutronix.de
1 parent 9a723ed commit fbd0e71

File tree

2 files changed

+259
-26
lines changed

2 files changed

+259
-26
lines changed

include/linux/rseq_types.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -122,14 +122,15 @@ struct mm_cid_pcpu {
122122
* @percpu: Set, when CIDs are in per CPU mode
123123
* @transit: Set to MM_CID_TRANSIT during a mode change transition phase
124124
* @max_cids: The exclusive maximum CID value for allocation and convergence
125-
* @lock: Spinlock to protect all fields except @pcpu. It also protects
126-
* the MM cid cpumask and the MM cidmask bitmap.
125+
* @lock: Spinlock to protect against affinity setting which can't take @mutex
127126
* @mutex: Mutex to serialize forks and exits related to this mm
128127
* @nr_cpus_allowed: The number of CPUs in the per MM allowed CPUs map. The map
129128
* is growth only.
130129
* @users: The number of tasks sharing this MM. Separate from mm::mm_users
131130
* as that is modified by mmget()/mm_put() by other entities which
132131
* do not actually share the MM.
132+
* @pcpu_thrs: Threshold for switching back from per CPU mode
133+
* @update_deferred: A deferred switch back to per task mode is pending.
133134
*/
134135
struct mm_mm_cid {
135136
/* Hotpath read mostly members */
@@ -144,6 +145,8 @@ struct mm_mm_cid {
144145
/* Low frequency modified */
145146
unsigned int nr_cpus_allowed;
146147
unsigned int users;
148+
unsigned int pcpu_thrs;
149+
unsigned int update_deferred;
147150
}____cacheline_aligned_in_smp;
148151
#else /* CONFIG_SCHED_MM_CID */
149152
struct mm_mm_cid { };

kernel/sched/core.c

Lines changed: 254 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -10396,43 +10396,270 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
1039610396
* task needs to drop the CID into the pool when scheduling out. Both bits
1039710397
* (ONCPU and TRANSIT) are filtered out by task_cid() when the CID is
1039810398
* actually handed over to user space in the RSEQ memory.
10399+
*
10400+
* Mode switching:
10401+
*
10402+
* Switching to per CPU mode happens when the user count becomes greater
10403+
* than the maximum number of CIDs, which is calculated by:
10404+
*
10405+
* opt_cids = min(mm_cid::nr_cpus_allowed, mm_cid::users);
10406+
* max_cids = min(1.25 * opt_cids, num_possible_cpus());
10407+
*
10408+
* The +25% allowance is useful for tight CPU masks in scenarios where only
10409+
* a few threads are created and destroyed to avoid frequent mode
10410+
* switches. Though this allowance shrinks, the closer opt_cids becomes to
10411+
* num_possible_cpus(), which is the (unfortunate) hard ABI limit.
10412+
*
10413+
* At the point of switching to per CPU mode the new user is not yet
10414+
* visible in the system, so the task which initiated the fork() runs the
10415+
* fixup function: mm_cid_fixup_tasks_to_cpu() walks the thread list and
10416+
* either transfers each tasks owned CID to the CPU the task runs on or
10417+
* drops it into the CID pool if a task is not on a CPU at that point in
10418+
* time. Tasks which schedule in before the task walk reaches them do the
10419+
* handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus() completes
10420+
* it's guaranteed that no task related to that MM owns a CID anymore.
10421+
*
10422+
* Switching back to task mode happens when the user count goes below the
10423+
* threshold which was recorded on the per CPU mode switch:
10424+
*
10425+
* pcpu_thrs = min(opt_cids - (opt_cids / 4), num_possible_cpus() / 2);
10426+
*
10427+
* This threshold is updated when a affinity change increases the number of
10428+
* allowed CPUs for the MM, which might cause a switch back to per task
10429+
* mode.
10430+
*
10431+
* If the switch back was initiated by a exiting task, then that task runs
10432+
* the fixup function. If it was initiated by a affinity change, then it's
10433+
* run either in the deferred update function in context of a workqueue or
10434+
* by a task which forks a new one or by a task which exits. Whatever
10435+
* happens first. mm_cid_fixup_cpus_to_task() walks through the possible
10436+
* CPUs and either transfers the CPU owned CIDs to a related task which
10437+
* runs on the CPU or drops it into the pool. Tasks which schedule in on a
10438+
* CPU which the walk did not cover yet do the handover themself.
10439+
*
10440+
* This transition from CPU to per task ownership happens in two phases:
10441+
*
10442+
* 1) mm:mm_cid.transit contains MM_CID_TRANSIT This is OR'ed on the task
10443+
* CID and denotes that the CID is only temporarily owned by the
10444+
* task. When it schedules out the task drops the CID back into the
10445+
* pool if this bit is set.
10446+
*
10447+
* 2) The initiating context walks the per CPU space and after completion
10448+
* clears mm:mm_cid.transit. So after that point the CIDs are strictly
10449+
* task owned again.
10450+
*
10451+
* This two phase transition is required to prevent CID space exhaustion
10452+
* during the transition as a direct transfer of ownership would fail if
10453+
* two tasks are scheduled in on the same CPU before the fixup freed per
10454+
* CPU CIDs.
10455+
*
10456+
* When mm_cid_fixup_cpus_to_tasks() completes it's guaranteed that no CID
10457+
* related to that MM is owned by a CPU anymore.
1039910458
*/
1040010459

1040110460
/*
1040210461
* Update the CID range properties when the constraints change. Invoked via
1040310462
* fork(), exit() and affinity changes
1040410463
*/
10405-
static void mm_update_max_cids(struct mm_struct *mm)
10464+
static void __mm_update_max_cids(struct mm_mm_cid *mc)
10465+
{
10466+
unsigned int opt_cids, max_cids;
10467+
10468+
/* Calculate the new optimal constraint */
10469+
opt_cids = min(mc->nr_cpus_allowed, mc->users);
10470+
10471+
/* Adjust the maximum CIDs to +25% limited by the number of possible CPUs */
10472+
max_cids = min(opt_cids + (opt_cids / 4), num_possible_cpus());
10473+
WRITE_ONCE(mc->max_cids, max_cids);
10474+
}
10475+
10476+
static inline unsigned int mm_cid_calc_pcpu_thrs(struct mm_mm_cid *mc)
10477+
{
10478+
unsigned int opt_cids;
10479+
10480+
opt_cids = min(mc->nr_cpus_allowed, mc->users);
10481+
/* Has to be at least 1 because 0 indicates PCPU mode off */
10482+
return max(min(opt_cids - opt_cids / 4, num_possible_cpus() / 2), 1);
10483+
}
10484+
10485+
static bool mm_update_max_cids(struct mm_struct *mm)
1040610486
{
1040710487
struct mm_mm_cid *mc = &mm->mm_cid;
10408-
unsigned int max_cids;
1040910488

1041010489
lockdep_assert_held(&mm->mm_cid.lock);
1041110490

10412-
/* Calculate the new maximum constraint */
10413-
max_cids = min(mc->nr_cpus_allowed, mc->users);
10414-
WRITE_ONCE(mc->max_cids, max_cids);
10491+
/* Clear deferred mode switch flag. A change is handled by the caller */
10492+
mc->update_deferred = false;
10493+
__mm_update_max_cids(mc);
10494+
10495+
/* Check whether owner mode must be changed */
10496+
if (!mc->percpu) {
10497+
/* Enable per CPU mode when the number of users is above max_cids */
10498+
if (mc->users > mc->max_cids)
10499+
mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc);
10500+
} else {
10501+
/* Switch back to per task if user count under threshold */
10502+
if (mc->users < mc->pcpu_thrs)
10503+
mc->pcpu_thrs = 0;
10504+
}
10505+
10506+
/* Mode change required? */
10507+
if (!!mc->percpu == !!mc->pcpu_thrs)
10508+
return false;
10509+
/* When switching back to per TASK mode, set the transition flag */
10510+
if (!mc->pcpu_thrs)
10511+
WRITE_ONCE(mc->transit, MM_CID_TRANSIT);
10512+
WRITE_ONCE(mc->percpu, !!mc->pcpu_thrs);
10513+
return true;
1041510514
}
1041610515

1041710516
static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk)
1041810517
{
1041910518
struct cpumask *mm_allowed;
10519+
struct mm_mm_cid *mc;
1042010520
unsigned int weight;
1042110521

1042210522
if (!mm || !READ_ONCE(mm->mm_cid.users))
1042310523
return;
10424-
1042510524
/*
1042610525
* mm::mm_cid::mm_cpus_allowed is the superset of each threads
1042710526
* allowed CPUs mask which means it can only grow.
1042810527
*/
10429-
guard(raw_spinlock)(&mm->mm_cid.lock);
10528+
mc = &mm->mm_cid;
10529+
guard(raw_spinlock)(&mc->lock);
1043010530
mm_allowed = mm_cpus_allowed(mm);
1043110531
weight = cpumask_weighted_or(mm_allowed, mm_allowed, affmsk);
10432-
if (weight == mm->mm_cid.nr_cpus_allowed)
10532+
if (weight == mc->nr_cpus_allowed)
10533+
return;
10534+
10535+
WRITE_ONCE(mc->nr_cpus_allowed, weight);
10536+
__mm_update_max_cids(mc);
10537+
if (!mc->percpu)
1043310538
return;
10434-
WRITE_ONCE(mm->mm_cid.nr_cpus_allowed, weight);
10435-
mm_update_max_cids(mm);
10539+
10540+
/* Adjust the threshold to the wider set */
10541+
mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc);
10542+
10543+
/* Scheduling of deferred mode switch goes here */
10544+
}
10545+
10546+
static inline void mm_cid_transit_to_task(struct task_struct *t, struct mm_cid_pcpu *pcp)
10547+
{
10548+
if (cid_on_cpu(t->mm_cid.cid)) {
10549+
unsigned int cid = cpu_cid_to_cid(t->mm_cid.cid);
10550+
10551+
t->mm_cid.cid = cid_to_transit_cid(cid);
10552+
pcp->cid = t->mm_cid.cid;
10553+
}
10554+
}
10555+
10556+
static void __maybe_unused mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm)
10557+
{
10558+
unsigned int cpu;
10559+
10560+
/* Walk the CPUs and fixup all stale CIDs */
10561+
for_each_possible_cpu(cpu) {
10562+
struct mm_cid_pcpu *pcp = per_cpu_ptr(mm->mm_cid.pcpu, cpu);
10563+
struct rq *rq = cpu_rq(cpu);
10564+
10565+
/* Remote access to mm::mm_cid::pcpu requires rq_lock */
10566+
guard(rq_lock_irq)(rq);
10567+
/* Is the CID still owned by the CPU? */
10568+
if (cid_on_cpu(pcp->cid)) {
10569+
/*
10570+
* If rq->curr has @mm, transfer it with the
10571+
* transition bit set. Otherwise drop it.
10572+
*/
10573+
if (rq->curr->mm == mm && rq->curr->mm_cid.active)
10574+
mm_cid_transit_to_task(rq->curr, pcp);
10575+
else
10576+
mm_drop_cid_on_cpu(mm, pcp);
10577+
10578+
} else if (rq->curr->mm == mm && rq->curr->mm_cid.active) {
10579+
unsigned int cid = rq->curr->mm_cid.cid;
10580+
10581+
/* Ensure it has the transition bit set */
10582+
if (!cid_in_transit(cid)) {
10583+
cid = cid_to_transit_cid(cid);
10584+
rq->curr->mm_cid.cid = cid;
10585+
pcp->cid = cid;
10586+
}
10587+
}
10588+
}
10589+
/* Clear the transition bit */
10590+
WRITE_ONCE(mm->mm_cid.transit, 0);
10591+
}
10592+
10593+
static inline void mm_cid_transfer_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp)
10594+
{
10595+
if (cid_on_task(t->mm_cid.cid)) {
10596+
t->mm_cid.cid = cid_to_cpu_cid(t->mm_cid.cid);
10597+
pcp->cid = t->mm_cid.cid;
10598+
}
10599+
}
10600+
10601+
static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm)
10602+
{
10603+
/* Remote access to mm::mm_cid::pcpu requires rq_lock */
10604+
guard(task_rq_lock)(t);
10605+
/* If the task is not active it is not in the users count */
10606+
if (!t->mm_cid.active)
10607+
return false;
10608+
if (cid_on_task(t->mm_cid.cid)) {
10609+
/* If running on the CPU, transfer the CID, otherwise drop it */
10610+
if (task_rq(t)->curr == t)
10611+
mm_cid_transfer_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t)));
10612+
else
10613+
mm_unset_cid_on_task(t);
10614+
}
10615+
return true;
10616+
}
10617+
10618+
static void __maybe_unused mm_cid_fixup_tasks_to_cpus(void)
10619+
{
10620+
struct mm_struct *mm = current->mm;
10621+
struct task_struct *p, *t;
10622+
unsigned int users;
10623+
10624+
/*
10625+
* This can obviously race with a concurrent affinity change, which
10626+
* increases the number of allowed CPUs for this mm, but that does
10627+
* not affect the mode and only changes the CID constraints. A
10628+
* possible switch back to per task mode happens either in the
10629+
* deferred handler function or in the next fork()/exit().
10630+
*
10631+
* The caller has already transferred. The newly incoming task is
10632+
* already accounted for, but not yet visible.
10633+
*/
10634+
users = mm->mm_cid.users - 2;
10635+
if (!users)
10636+
return;
10637+
10638+
guard(rcu)();
10639+
for_other_threads(current, t) {
10640+
if (mm_cid_fixup_task_to_cpu(t, mm))
10641+
users--;
10642+
}
10643+
10644+
if (!users)
10645+
return;
10646+
10647+
/* Happens only for VM_CLONE processes. */
10648+
for_each_process_thread(p, t) {
10649+
if (t == current || t->mm != mm)
10650+
continue;
10651+
if (mm_cid_fixup_task_to_cpu(t, mm)) {
10652+
if (--users == 0)
10653+
return;
10654+
}
10655+
}
10656+
}
10657+
10658+
static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm)
10659+
{
10660+
t->mm_cid.active = 1;
10661+
mm->mm_cid.users++;
10662+
return mm_update_max_cids(mm);
1043610663
}
1043710664

1043810665
void sched_mm_cid_fork(struct task_struct *t)
@@ -10442,12 +10669,19 @@ void sched_mm_cid_fork(struct task_struct *t)
1044210669
WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET);
1044310670

1044410671
guard(mutex)(&mm->mm_cid.mutex);
10445-
guard(raw_spinlock)(&mm->mm_cid.lock);
10446-
t->mm_cid.active = 1;
10447-
mm->mm_cid.users++;
10448-
/* Preset last_cid for mm_cid_select() */
10449-
t->mm_cid.last_cid = READ_ONCE(mm->mm_cid.max_cids) - 1;
10450-
mm_update_max_cids(mm);
10672+
scoped_guard(raw_spinlock, &mm->mm_cid.lock) {
10673+
sched_mm_cid_add_user(t, mm);
10674+
/* Preset last_cid for mm_cid_select() */
10675+
t->mm_cid.last_cid = mm->mm_cid.max_cids - 1;
10676+
}
10677+
}
10678+
10679+
static bool sched_mm_cid_remove_user(struct task_struct *t)
10680+
{
10681+
t->mm_cid.active = 0;
10682+
mm_unset_cid_on_task(t);
10683+
t->mm->mm_cid.users--;
10684+
return mm_update_max_cids(t->mm);
1045110685
}
1045210686

1045310687
/*
@@ -10462,14 +10696,8 @@ void sched_mm_cid_exit(struct task_struct *t)
1046210696
return;
1046310697

1046410698
guard(mutex)(&mm->mm_cid.mutex);
10465-
guard(raw_spinlock)(&mm->mm_cid.lock);
10466-
t->mm_cid.active = 0;
10467-
mm->mm_cid.users--;
10468-
if (t->mm_cid.cid != MM_CID_UNSET) {
10469-
clear_bit(t->mm_cid.cid, mm_cidmask(mm));
10470-
t->mm_cid.cid = MM_CID_UNSET;
10471-
}
10472-
mm_update_max_cids(mm);
10699+
scoped_guard(raw_spinlock, &mm->mm_cid.lock)
10700+
sched_mm_cid_remove_user(t);
1047310701
}
1047410702

1047510703
/* Deactivate MM CID allocation across execve() */
@@ -10499,6 +10727,8 @@ void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
1049910727
mm->mm_cid.transit = 0;
1050010728
mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed;
1050110729
mm->mm_cid.users = 0;
10730+
mm->mm_cid.pcpu_thrs = 0;
10731+
mm->mm_cid.update_deferred = 0;
1050210732
raw_spin_lock_init(&mm->mm_cid.lock);
1050310733
mutex_init(&mm->mm_cid.mutex);
1050410734
cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);

0 commit comments

Comments
 (0)