On Fri, Mar 06, 2026 at 10:57:15AM +0100, Thomas Gleixner wrote:
> I tried with tons of test cases which stress test mmcid with threads and
> failed.
Are some of those in tools/testing/selftests ?
Anyway, I was going over that code, and I noticed that there seems to be
inconsistent locking for mm_mm_cid::pcpu.
There's a bunch of sites that state we need rq->lock for remote access;
but then things like sched_mm_cid_fork() and sched_mm_cid_exit() seem to
think that holding mm_cid->lock is sufficient.
This doesn't make sense to me, but maybe I missed something.
Anyway, I cobbled together the below, and that builds and boots and
passes everything in tools/testing/selftests/rseq.
YMMV
---
diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h
index da5fa6f40294..df2b4629cbfd 100644
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -2,9 +2,12 @@
#ifndef _LINUX_RSEQ_TYPES_H
#define _LINUX_RSEQ_TYPES_H
+#include <linux/compiler_types.h>
#include <linux/irq_work_types.h>
#include <linux/types.h>
#include <linux/workqueue_types.h>
+#include <linux/mutex.h>
+#include <asm/percpu.h>
#ifdef CONFIG_RSEQ
struct rseq;
@@ -145,8 +148,14 @@ struct sched_mm_cid {
* while a task with a CID is running
*/
struct mm_cid_pcpu {
- unsigned int cid;
-}____cacheline_aligned_in_smp;
+ unsigned int cid;
+} ____cacheline_aligned_in_smp;
+
+/*
+ * See helpers in kernel/sched/sched.h that convert
+ * from __rq_lockp(rq) to RQ_LOCK.
+ */
+token_context_lock(RQ_LOCK);
/**
* struct mm_mm_cid - Storage for per MM CID data
@@ -167,7 +176,7 @@ struct mm_cid_pcpu {
*/
struct mm_mm_cid {
/* Hotpath read mostly members */
- struct mm_cid_pcpu __percpu *pcpu;
+ struct mm_cid_pcpu __percpu *pcpu __guarded_by(RQ_LOCK);
unsigned int mode;
unsigned int max_cids;
@@ -179,11 +188,11 @@ struct mm_mm_cid {
struct mutex mutex;
/* Low frequency modified */
- unsigned int nr_cpus_allowed;
- unsigned int users;
- unsigned int pcpu_thrs;
- unsigned int update_deferred;
-}____cacheline_aligned_in_smp;
+ unsigned int nr_cpus_allowed __guarded_by(&lock);
+ unsigned int users __guarded_by(&lock);
+ unsigned int pcpu_thrs __guarded_by(&lock);
+ unsigned int update_deferred __guarded_by(&lock);
+} ____cacheline_aligned_in_smp;
#else /* CONFIG_SCHED_MM_CID */
struct mm_mm_cid { };
struct sched_mm_cid { };
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2b571e640372..f7c03c9c4fd0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5335,7 +5335,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
}
}
- mm_cid_switch_to(prev, next);
+ mm_cid_switch_to(rq, prev, next);
/*
* Tell rseq that the task was scheduled in. Must be after
@@ -10511,6 +10511,7 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
* fork(), exit() and affinity changes
*/
static void __mm_update_max_cids(struct mm_mm_cid *mc)
+ __must_hold(&mc->lock)
{
unsigned int opt_cids, max_cids;
@@ -10523,15 +10524,17 @@ static void __mm_update_max_cids(struct mm_mm_cid *mc)
}
static inline unsigned int mm_cid_calc_pcpu_thrs(struct mm_mm_cid *mc)
+ __must_hold(&mc->lock)
{
unsigned int opt_cids;
opt_cids = min(mc->nr_cpus_allowed, mc->users);
/* Has to be at least 1 because 0 indicates PCPU mode off */
- return max(min(opt_cids - opt_cids / 4, num_possible_cpus() / 2), 1);
+ return max(min(opt_cids - (opt_cids / 4), num_possible_cpus() / 2), 1);
}
static bool mm_update_max_cids(struct mm_struct *mm)
+ __must_hold(&mm->mm_cid.lock)
{
struct mm_mm_cid *mc = &mm->mm_cid;
bool percpu = cid_on_cpu(mc->mode);
@@ -10558,6 +10561,7 @@ static bool mm_update_max_cids(struct mm_struct *mm)
return false;
/* Flip the mode and set the transition flag to bridge the transfer */
+ WARN_ON_ONCE(mc->mode & MM_CID_TRANSIT);
WRITE_ONCE(mc->mode, mc->mode ^ (MM_CID_TRANSIT | MM_CID_ONCPU));
/*
* Order the store against the subsequent fixups so that
@@ -10568,16 +10572,28 @@ static bool mm_update_max_cids(struct mm_struct *mm)
return true;
}
+/*
+ * Silly helper because we cannot express that mm_mm_cid::users is updated
+ * while holding both mutex and lock and can thus be read while holding
+ * either.
+ */
+static __always_inline unsigned int mm_cid_users(struct mm_struct *mm)
+ __must_hold(&mm->mm_cid.mutex)
+{
+ __assume_ctx_lock(&mm->mm_cid.lock);
+ return mm->mm_cid.users;
+}
+
 static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk)
{
struct cpumask *mm_allowed;
struct mm_mm_cid *mc;
unsigned int weight;
- if (!mm || !READ_ONCE(mm->mm_cid.users))
+ if (!mm || !data_race(READ_ONCE(mm->mm_cid.users)))
return;
/*
- * mm::mm_cid::mm_cpus_allowed is the superset of each threads
+ * mm::mm_cid::mm_cpus_allowed is the superset of each thread's
* allowed CPUs mask which means it can only grow.
*/
mc = &mm->mm_cid;
@@ -10609,6 +10625,7 @@ static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpu
 static inline void mm_cid_complete_transit(struct mm_struct *mm, unsigned int mode)
{
+ WARN_ON_ONCE(!(mm->mm_cid.mode & MM_CID_TRANSIT));
/*
* Ensure that the store removing the TRANSIT bit cannot be
* reordered by the CPU before the fixups have been completed.
@@ -10633,11 +10650,12 @@ static void mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm)
/* Walk the CPUs and fixup all stale CIDs */
for_each_possible_cpu(cpu) {
- struct mm_cid_pcpu *pcp = per_cpu_ptr(mm->mm_cid.pcpu, cpu);
struct rq *rq = cpu_rq(cpu);
+ struct mm_cid_pcpu *pcp;
/* Remote access to mm::mm_cid::pcpu requires rq_lock */
guard(rq_lock_irq)(rq);
+ pcp = mm_cid_pcpu(&mm->mm_cid, rq);
/* Is the CID still owned by the CPU? */
if (cid_on_cpu(pcp->cid)) {
/*
@@ -10675,6 +10693,7 @@ static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm
{
/* Remote access to mm::mm_cid::pcpu requires rq_lock */
guard(task_rq_lock)(t);
+ __assume_ctx_lock(RQ_LOCK);
/* If the task is not active it is not in the users count */
if (!t->mm_cid.active)
return false;
@@ -10689,6 +10708,7 @@ static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm
}
static void mm_cid_do_fixup_tasks_to_cpus(struct mm_struct *mm)
+ __must_hold(&mm->mm_cid.mutex)
{
struct task_struct *p, *t;
unsigned int users;
@@ -10703,7 +10723,7 @@ static void mm_cid_do_fixup_tasks_to_cpus(struct mm_struct *mm)
* The caller has already transferred. The newly incoming task is
* already accounted for, but not yet visible.
*/
- users = mm->mm_cid.users - 2;
+ users = mm_cid_users(mm) - 2;
if (!users)
return;
@@ -10727,18 +10747,19 @@ static void mm_cid_do_fixup_tasks_to_cpus(struct mm_struct *mm)
}
}
-static void mm_cid_fixup_tasks_to_cpus(void)
+static void mm_cid_fixup_tasks_to_cpus(struct mm_struct *mm)
+ __must_hold(mm->mm_cid.mutex)
{
- struct mm_struct *mm = current->mm;
-
mm_cid_do_fixup_tasks_to_cpus(mm);
mm_cid_complete_transit(mm, MM_CID_ONCPU);
}
static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm)
+ __must_hold(&mm->mm_cid.mutex)
+ __must_hold(&mm->mm_cid.lock)
{
t->mm_cid.active = 1;
- mm->mm_cid.users++;
+ mm->mm_cid.users++; /* mutex && lock */
return mm_update_max_cids(mm);
}
@@ -10750,8 +10771,9 @@ void sched_mm_cid_fork(struct task_struct *t)
WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET);
guard(mutex)(&mm->mm_cid.mutex);
- scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
- struct mm_cid_pcpu *pcp = this_cpu_ptr(mm->mm_cid.pcpu);
+ scoped_guard(rq_lock_irq, this_rq()) {
+ struct mm_cid_pcpu *pcp = mm_cid_pcpu(&mm->mm_cid, this_rq());
+ guard(raw_spinlock)(&mm->mm_cid.lock);
/* First user ? */
if (!mm->mm_cid.users) {
@@ -10777,7 +10799,7 @@ void sched_mm_cid_fork(struct task_struct *t)
}
if (percpu) {
- mm_cid_fixup_tasks_to_cpus();
+ mm_cid_fixup_tasks_to_cpus(mm);
} else {
mm_cid_fixup_cpus_to_tasks(mm);
t->mm_cid.cid = mm_get_cid(mm);
@@ -10785,6 +10807,8 @@ void sched_mm_cid_fork(struct task_struct *t)
}
static bool sched_mm_cid_remove_user(struct task_struct *t)
+ __must_hold(t->mm->mm_cid.mutex)
+ __must_hold(t->mm->mm_cid.lock)
{
t->mm_cid.active = 0;
scoped_guard(preempt) {
@@ -10792,11 +10816,13 @@ static bool sched_mm_cid_remove_user(struct task_struct *t)
t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid);
mm_unset_cid_on_task(t);
}
- t->mm->mm_cid.users--;
+ t->mm->mm_cid.users--; /* mutex && lock */
return mm_update_max_cids(t->mm);
}
static bool __sched_mm_cid_exit(struct task_struct *t)
+ __must_hold(t->mm->mm_cid.mutex)
+ __must_hold(t->mm->mm_cid.lock)
{
struct mm_struct *mm = t->mm;
@@ -10837,8 +10863,9 @@ void sched_mm_cid_exit(struct task_struct *t)
*/
scoped_guard(mutex, &mm->mm_cid.mutex) {
/* mm_cid::mutex is sufficient to protect mm_cid::users */
- if (likely(mm->mm_cid.users > 1)) {
- scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+ if (likely(mm_cid_users(mm) > 1)) {
+ scoped_guard(rq_lock_irq, this_rq()) {
+ guard(raw_spinlock)(&mm->mm_cid.lock);
if (!__sched_mm_cid_exit(t))
return;
/*
@@ -10847,16 +10874,17 @@ void sched_mm_cid_exit(struct task_struct *t)
* TRANSIT bit. If the CID is owned by the CPU
* then drop it.
*/
- mm_drop_cid_on_cpu(mm, this_cpu_ptr(mm->mm_cid.pcpu));
+ mm_drop_cid_on_cpu(mm, mm_cid_pcpu(&mm->mm_cid, this_rq()));
}
mm_cid_fixup_cpus_to_tasks(mm);
return;
}
/* Last user */
- scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+ scoped_guard(rq_lock_irq, this_rq()) {
+ guard(raw_spinlock)(&mm->mm_cid.lock);
/* Required across execve() */
if (t == current)
- mm_cid_transit_to_task(t, this_cpu_ptr(mm->mm_cid.pcpu));
+ mm_cid_transit_to_task(t, mm_cid_pcpu(&mm->mm_cid, this_rq()));
/* Ignore mode change. There is nothing to do. */
sched_mm_cid_remove_user(t);
}
@@ -10893,7 +10921,7 @@ static void mm_cid_work_fn(struct work_struct *work)
guard(mutex)(&mm->mm_cid.mutex);
/* Did the last user task exit already? */
- if (!mm->mm_cid.users)
+ if (!mm_cid_users(mm))
return;
scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
@@ -10924,13 +10952,14 @@ static void mm_cid_irq_work(struct irq_work *work)
void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
{
- mm->mm_cid.max_cids = 0;
mm->mm_cid.mode = 0;
- mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed;
- mm->mm_cid.users = 0;
- mm->mm_cid.pcpu_thrs = 0;
- mm->mm_cid.update_deferred = 0;
- raw_spin_lock_init(&mm->mm_cid.lock);
+ mm->mm_cid.max_cids = 0;
+ scoped_guard (raw_spinlock_init, &mm->mm_cid.lock) {
+ mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed;
+ mm->mm_cid.users = 0;
+ mm->mm_cid.pcpu_thrs = 0;
+ mm->mm_cid.update_deferred = 0;
+ }
mutex_init(&mm->mm_cid.mutex);
mm->mm_cid.irq_work = IRQ_WORK_INIT_HARD(mm_cid_irq_work);
INIT_WORK(&mm->mm_cid.work, mm_cid_work_fn);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fd36ae390520..8c761822d6b2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3870,13 +3870,39 @@ static __always_inline void mm_cid_update_task_cid(struct task_struct *t, unsign
}
}
-static __always_inline void mm_cid_update_pcpu_cid(struct mm_struct *mm, unsigned int cid)
+/*
+ * Helpers to convert from __rq_lockp(rq) to RQ_LOCK intermediate.
+ */
+static __always_inline
+void mm_cid_update_pcpu_cid(struct rq *rq, struct mm_struct *mm, unsigned int cid)
+ __must_hold(__rq_lockp(rq))
{
+ __assume_ctx_lock(RQ_LOCK);
+ lockdep_assert(rq->cpu == smp_processor_id());
__this_cpu_write(mm->mm_cid.pcpu->cid, cid);
}
-static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int cpu_cid,
- unsigned int mode)
+static __always_inline
+unsigned int mm_cid_pcpu_cid(struct rq *rq, struct mm_struct *mm)
+ __must_hold(__rq_lockp(rq))
+{
+ __assume_ctx_lock(RQ_LOCK);
+ lockdep_assert(rq->cpu == smp_processor_id());
+ return __this_cpu_read(mm->mm_cid.pcpu->cid);
+}
+
+static __always_inline
+struct mm_cid_pcpu *mm_cid_pcpu(struct mm_mm_cid *mc, struct rq *rq)
+ __must_hold(__rq_lockp(rq))
+
+{
+ __assume_ctx_lock(RQ_LOCK);
+ return per_cpu_ptr(mc->pcpu, rq->cpu);
+}
+
+static __always_inline void mm_cid_from_cpu(struct rq *rq, struct task_struct *t,
+ unsigned int cpu_cid, unsigned int mode)
+ __must_hold(__rq_lockp(rq))
{
unsigned int max_cids, tcid = t->mm_cid.cid;
struct mm_struct *mm = t->mm;
@@ -3906,12 +3932,13 @@ static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int
if (mode & MM_CID_TRANSIT)
cpu_cid = cpu_cid_to_cid(cpu_cid) | MM_CID_TRANSIT;
}
- mm_cid_update_pcpu_cid(mm, cpu_cid);
+ mm_cid_update_pcpu_cid(rq, mm, cpu_cid);
mm_cid_update_task_cid(t, cpu_cid);
}
-static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int cpu_cid,
- unsigned int mode)
+static __always_inline void mm_cid_from_task(struct rq *rq, struct task_struct *t,
+ unsigned int cpu_cid, unsigned int mode)
+ __must_hold(__rq_lockp(rq))
{
unsigned int max_cids, tcid = t->mm_cid.cid;
struct mm_struct *mm = t->mm;
@@ -3920,7 +3947,7 @@ static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int
/* Optimize for the common case, where both have the ONCPU bit clear */
if (likely(cid_on_task(tcid | cpu_cid))) {
if (likely(tcid < max_cids)) {
- mm_cid_update_pcpu_cid(mm, tcid);
+ mm_cid_update_pcpu_cid(rq, mm, tcid);
return;
}
/* Try to converge into the optimal CID space */
@@ -3929,7 +3956,7 @@ static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int
/* Hand over or drop the CPU owned CID */
if (cid_on_cpu(cpu_cid)) {
if (cid_on_task(tcid))
- mm_drop_cid_on_cpu(mm, this_cpu_ptr(mm->mm_cid.pcpu));
+ mm_drop_cid_on_cpu(mm, mm_cid_pcpu(&mm->mm_cid, rq));
else
tcid = cpu_cid_to_cid(cpu_cid);
}
@@ -3939,11 +3966,12 @@ static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int
/* Set the transition mode flag if required */
tcid |= mode & MM_CID_TRANSIT;
}
- mm_cid_update_pcpu_cid(mm, tcid);
+ mm_cid_update_pcpu_cid(rq, mm, tcid);
mm_cid_update_task_cid(t, tcid);
}
-static __always_inline void mm_cid_schedin(struct task_struct *next)
+static __always_inline void mm_cid_schedin(struct rq *rq, struct task_struct *next)
+ __must_hold(__rq_lockp(rq))
{
struct mm_struct *mm = next->mm;
unsigned int cpu_cid, mode;
@@ -3951,15 +3979,16 @@ static __always_inline void mm_cid_schedin(struct task_struct *next)
if (!next->mm_cid.active)
return;
- cpu_cid = __this_cpu_read(mm->mm_cid.pcpu->cid);
+ cpu_cid = mm_cid_pcpu_cid(rq, mm);
mode = READ_ONCE(mm->mm_cid.mode);
if (likely(!cid_on_cpu(mode)))
- mm_cid_from_task(next, cpu_cid, mode);
+ mm_cid_from_task(rq, next, cpu_cid, mode);
else
- mm_cid_from_cpu(next, cpu_cid, mode);
+ mm_cid_from_cpu(rq, next, cpu_cid, mode);
}
-static __always_inline void mm_cid_schedout(struct task_struct *prev)
+static __always_inline void mm_cid_schedout(struct rq *rq, struct task_struct *prev)
+ __must_hold(__rq_lockp(rq))
{
struct mm_struct *mm = prev->mm;
unsigned int mode, cid;
@@ -3980,7 +4009,7 @@ static __always_inline void mm_cid_schedout(struct task_struct *prev)
cid = cid_to_cpu_cid(cid);
/* Update both so that the next schedule in goes into the fast
path */
- mm_cid_update_pcpu_cid(mm, cid);
+ mm_cid_update_pcpu_cid(rq, mm, cid);
prev->mm_cid.cid = cid;
} else {
mm_drop_cid(mm, cid);
@@ -3988,10 +4017,12 @@ static __always_inline void mm_cid_schedout(struct task_struct *prev)
}
}
-static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next)
+static inline void mm_cid_switch_to(struct rq *rq, struct task_struct *prev,
+ struct task_struct *next)
+ __must_hold(__rq_lockp(rq))
{
- mm_cid_schedout(prev);
- mm_cid_schedin(next);
+ mm_cid_schedout(rq, prev);
+ mm_cid_schedin(rq, next);
}
#else /* !CONFIG_SCHED_MM_CID: */