Skip to content

Commit 9da6ccb

Browse files
committed
sched/mmcid: Implement deferred mode change
When affinity changes cause an increase of the number of CPUs allowed for tasks which are related to an MM, that might result in a situation where the ownership mode can go back from per-CPU mode to per-task mode. As affinity changes happen with the runqueue lock held, there is no way to do the actual mode change and the required fixup right there. Add the infrastructure to defer it to a workqueue. The scheduled work can race with a fork() or exit(). Whichever happens first takes care of it. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Link: https://patch.msgid.link/20251119172550.216484739@linutronix.de
1 parent c809f08 commit 9da6ccb

File tree

2 files changed

+59
-7
lines changed

2 files changed

+59
-7
lines changed

include/linux/rseq_types.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@
22
#ifndef _LINUX_RSEQ_TYPES_H
33
#define _LINUX_RSEQ_TYPES_H
44

5+
#include <linux/irq_work_types.h>
56
#include <linux/types.h>
7+
#include <linux/workqueue_types.h>
68

79
#ifdef CONFIG_RSEQ
810
struct rseq;
@@ -122,6 +124,8 @@ struct mm_cid_pcpu {
122124
* @percpu: Set, when CIDs are in per CPU mode
123125
* @transit: Set to MM_CID_TRANSIT during a mode change transition phase
124126
* @max_cids: The exclusive maximum CID value for allocation and convergence
127+
* @irq_work: irq_work to handle the affinity mode change case
128+
* @work: Regular work to handle the affinity mode change case
125129
* @lock: Spinlock to protect against affinity setting which can't take @mutex
126130
* @mutex: Mutex to serialize forks and exits related to this mm
127131
* @nr_cpus_allowed: The number of CPUs in the per MM allowed CPUs map. The map
@@ -139,6 +143,10 @@ struct mm_mm_cid {
139143
unsigned int transit;
140144
unsigned int max_cids;
141145

146+
/* Rarely used. Moves @lock and @mutex into the second cacheline */
147+
struct irq_work irq_work;
148+
struct work_struct work;
149+
142150
raw_spinlock_t lock;
143151
struct mutex mutex;
144152

kernel/sched/core.c

Lines changed: 51 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10539,8 +10539,17 @@ static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpu
1053910539

1054010540
/* Adjust the threshold to the wider set */
1054110541
mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc);
10542+
/* Switch back to per task mode? */
10543+
if (mc->users >= mc->pcpu_thrs)
10544+
return;
10545+
10546+
/* Don't queue twice */
10547+
if (mc->update_deferred)
10548+
return;
1054210549

10543-
/* Scheduling of deferred mode switch goes here */
10550+
/* Queue the irq work, which schedules the real work */
10551+
mc->update_deferred = true;
10552+
irq_work_queue(&mc->irq_work);
1054410553
}
1054510554

1054610555
static inline void mm_cid_transit_to_task(struct task_struct *t, struct mm_cid_pcpu *pcp)
@@ -10553,7 +10562,7 @@ static inline void mm_cid_transit_to_task(struct task_struct *t, struct mm_cid_p
1055310562
}
1055410563
}
1055510564

10556-
static void __maybe_unused mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm)
10565+
static void mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm)
1055710566
{
1055810567
unsigned int cpu;
1055910568

@@ -10714,14 +10723,47 @@ void sched_mm_cid_after_execve(struct task_struct *t)
1071410723
mm_cid_select(t);
1071510724
}
1071610725

10717-
void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
10726+
static void mm_cid_work_fn(struct work_struct *work)
1071810727
{
10719-
struct mm_cid_pcpu __percpu *pcpu = mm->mm_cid.pcpu;
10720-
int cpu;
10728+
struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.work);
1072110729

10722-
for_each_possible_cpu(cpu)
10723-
per_cpu_ptr(pcpu, cpu)->cid = MM_CID_UNSET;
10730+
/* Make it compile, but not functional yet */
10731+
if (!IS_ENABLED(CONFIG_NEW_MM_CID))
10732+
return;
10733+
10734+
guard(mutex)(&mm->mm_cid.mutex);
10735+
/* Did the last user task exit already? */
10736+
if (!mm->mm_cid.users)
10737+
return;
10738+
10739+
scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
10740+
/* Have fork() or exit() handled it already? */
10741+
if (!mm->mm_cid.update_deferred)
10742+
return;
10743+
/* This clears mm_cid::update_deferred */
10744+
if (!mm_update_max_cids(mm))
10745+
return;
10746+
/* Affinity changes can only switch back to task mode */
10747+
if (WARN_ON_ONCE(mm->mm_cid.percpu))
10748+
return;
10749+
}
10750+
mm_cid_fixup_cpus_to_tasks(mm);
10751+
}
10752+
10753+
static void mm_cid_irq_work(struct irq_work *work)
10754+
{
10755+
struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.irq_work);
1072410756

10757+
/*
10758+
* Needs to be unconditional because mm_cid::lock cannot be held
10759+
* when scheduling work as mm_update_cpus_allowed() nests inside
10760+
* rq::lock and schedule_work() might end up in wakeup...
10761+
*/
10762+
schedule_work(&mm->mm_cid.work);
10763+
}
10764+
10765+
void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
10766+
{
1072510767
mm->mm_cid.max_cids = 0;
1072610768
mm->mm_cid.percpu = 0;
1072710769
mm->mm_cid.transit = 0;
@@ -10731,6 +10773,8 @@ void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
1073110773
mm->mm_cid.update_deferred = 0;
1073210774
raw_spin_lock_init(&mm->mm_cid.lock);
1073310775
mutex_init(&mm->mm_cid.mutex);
10776+
mm->mm_cid.irq_work = IRQ_WORK_INIT_HARD(mm_cid_irq_work);
10777+
INIT_WORK(&mm->mm_cid.work, mm_cid_work_fn);
1073410778
cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
1073510779
bitmap_zero(mm_cidmask(mm), num_possible_cpus());
1073610780
}

0 commit comments

Comments
 (0)