From: zhidao su <[email protected]>

Proxy Execution currently tracks blocked_on chains only through
struct mutex. This patch extends the infrastructure to support
rw_semaphore write-side blocking, allowing PE to eliminate priority
inversion where a high-priority writer waits for a low-priority
write lock holder.

Changes:

1. include/linux/sched.h: Generalise blocked_on from struct mutex *
   to void *, and add a 2-bit blocked_on_type field encoding the
   primitive type (BLOCKED_ON_NONE/MUTEX/RWSEM). All existing mutex
   helpers are renamed to _mutex suffix; compatibility wrappers
   preserve the old names so that mutex.c requires no change.
   New __set/clear_task_blocked_on_rwsem() helpers are added
   (void * parameter avoids pulling rwsem.h into sched.h).

2. kernel/locking/rwsem.c: In rwsem_down_write_slowpath(), call
   __set_task_blocked_on_rwsem() after entering the wait queue
   (wait_lock held), re-set it after each schedule() wakeup, and
   clear it on lock acquisition and on signal-interrupted exit.
   Pattern mirrors the existing mutex slowpath.

3. kernel/sched/core.c: find_proxy_task() now dispatches on
   blocked_on_type. The BLOCKED_ON_RWSEM branch acquires
   sem->wait_lock, re-validates blocked_on, then calls
   rwsem_owner() to retrieve the write owner (returns NULL for
   reader-owned sem, which safely terminates the chain). Owner
   validity checks (on_rq, sched_delayed, cpu, migrating) are
   shared between both branches.

4. tools/testing/selftests/sched/proxy_exec_test.c: Add TC-4
   (single-level rwsem write PE) and TC-5 (mixed rwsem->mutex
   chain). TAP plan updated from 3 to 5.

PREEMPT_RT limitation: rwsem is backed by rwbase_rt/rt_mutex under
CONFIG_PREEMPT_RT. The new code paths are not compiled on RT kernels;
on RT blocked_on is never set for rwsem and find_proxy_task()
terminates cleanly at such nodes.

Signed-off-by: zhidao su <[email protected]>
---
 include/linux/sched.h                         | 110 ++-
 kernel/locking/rwsem.c                        |   9 +
 kernel/sched/core.c                           | 106 ++-
 .../testing/selftests/sched/proxy_exec_test.c | 763 ++++++++++++++++++
 4 files changed, 943 insertions(+), 45 deletions(-)
 create mode 100644 tools/testing/selftests/sched/proxy_exec_test.c

diff --git a/include/linux/sched.h b/include/linux/sched.h
index a7b4a980eb2..4bef3618889 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1237,7 +1237,8 @@ struct task_struct {
        struct rt_mutex_waiter          *pi_blocked_on;
 #endif
 
-       struct mutex                    *blocked_on;    /* lock we're blocked on */
+       void                            *blocked_on;    /* lock we're blocked on */
+       unsigned int                     blocked_on_type : 2; /* enum blocked_on_type */
 
 #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
        /*
@@ -2178,8 +2179,21 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock) __must_hold(lock);
        __cond_resched_rwlock_write(lock);                                      \
 })
 
+/*
+ * Type tag for task_struct::blocked_on. Allows PE chain traversal
+ * to handle different lock primitives (mutex, rwsem write-side).
+ */
+enum blocked_on_type {
+       BLOCKED_ON_NONE  = 0,
+       BLOCKED_ON_MUTEX = 1,
+       BLOCKED_ON_RWSEM = 2,
+};
+
 #ifndef CONFIG_PREEMPT_RT
-static inline struct mutex *__get_task_blocked_on(struct task_struct *p)
+/* --- mutex blocked_on helpers --- */
+
+static inline struct mutex *
+__get_task_blocked_on_mutex(struct task_struct *p)
 {
        struct mutex *m = p->blocked_on;
 
@@ -2188,7 +2202,8 @@ static inline struct mutex *__get_task_blocked_on(struct task_struct *p)
        return m;
 }
 
-static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m)
+static inline void
+__set_task_blocked_on_mutex(struct task_struct *p, struct mutex *m)
 {
        struct mutex *blocked_on = READ_ONCE(p->blocked_on);
 
@@ -2204,15 +2219,18 @@ static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m)
         */
        WARN_ON_ONCE(blocked_on && blocked_on != m);
        WRITE_ONCE(p->blocked_on, m);
+       p->blocked_on_type = BLOCKED_ON_MUTEX;
 }
 
-static inline void set_task_blocked_on(struct task_struct *p, struct mutex *m)
+static inline void
+set_task_blocked_on_mutex(struct task_struct *p, struct mutex *m)
 {
        guard(raw_spinlock_irqsave)(&m->wait_lock);
-       __set_task_blocked_on(p, m);
+       __set_task_blocked_on_mutex(p, m);
 }
 
-static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *m)
+static inline void
+__clear_task_blocked_on_mutex(struct task_struct *p, struct mutex *m)
 {
        if (m) {
                struct mutex *blocked_on = READ_ONCE(p->blocked_on);
@@ -2227,21 +2245,91 @@ static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *
                WARN_ON_ONCE(blocked_on && blocked_on != m);
        }
        WRITE_ONCE(p->blocked_on, NULL);
+       p->blocked_on_type = BLOCKED_ON_NONE;
 }
 
-static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m)
+static inline void
+clear_task_blocked_on_mutex(struct task_struct *p, struct mutex *m)
 {
        guard(raw_spinlock_irqsave)(&m->wait_lock);
-       __clear_task_blocked_on(p, m);
+       __clear_task_blocked_on_mutex(p, m);
 }
-#else
-static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m)
+
+/* Compatibility wrappers — keep mutex.c callers unchanged */
+static inline struct mutex *
+__get_task_blocked_on(struct task_struct *p)
+{
+       return __get_task_blocked_on_mutex(p);
+}
+
+static inline void
+__set_task_blocked_on(struct task_struct *p, struct mutex *m)
 {
+       __set_task_blocked_on_mutex(p, m);
 }
 
-static inline void clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m)
+static inline void
+set_task_blocked_on(struct task_struct *p, struct mutex *m)
+{
+       set_task_blocked_on_mutex(p, m);
+}
+
+static inline void
+__clear_task_blocked_on(struct task_struct *p, struct mutex *m)
 {
+       __clear_task_blocked_on_mutex(p, m);
 }
+
+static inline void
+clear_task_blocked_on(struct task_struct *p, struct mutex *m)
+{
+       clear_task_blocked_on_mutex(p, m);
+}
+
+/* --- rwsem write-side blocked_on helpers --- */
+
+/*
+ * __set/clear_task_blocked_on_rwsem: called with sem->wait_lock held.
+ * Uses void* to avoid pulling struct rw_semaphore into sched.h.
+ * Callers (rwsem.c) cast sem to void* before passing.
+ */
+static inline void
+__set_task_blocked_on_rwsem(struct task_struct *p, void *sem)
+{
+       void *blocked_on = READ_ONCE(p->blocked_on);
+
+       WARN_ON_ONCE(!sem);
+       /* The task should only be setting itself as blocked */
+       WARN_ON_ONCE(p != current);
+       WARN_ON_ONCE(blocked_on && blocked_on != sem);
+       WRITE_ONCE(p->blocked_on, sem);
+       p->blocked_on_type = BLOCKED_ON_RWSEM;
+}
+
+static inline void
+__clear_task_blocked_on_rwsem(struct task_struct *p, void *sem)
+{
+       if (sem) {
+               void *blocked_on = READ_ONCE(p->blocked_on);
+
+               WARN_ON_ONCE(blocked_on && blocked_on != sem);
+       }
+       WRITE_ONCE(p->blocked_on, NULL);
+       p->blocked_on_type = BLOCKED_ON_NONE;
+}
+
+#else /* CONFIG_PREEMPT_RT */
+
+static inline void
+__clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m)
+{
+}
+
+static inline void
+clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m)
+{
+}
+
 #endif /* !CONFIG_PREEMPT_RT */
 
 static __always_inline bool need_resched(void)
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 24df4d98f7d..4ef9893a3e4 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1154,6 +1154,8 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 
        if (state == TASK_UNINTERRUPTIBLE)
                hung_task_set_blocker(sem, BLOCKER_TYPE_RWSEM_WRITER);
+       /* PE: mark this task as blocked on the rwsem write lock */
+       __set_task_blocked_on_rwsem(current, sem);
 
        for (;;) {
                if (rwsem_try_write_lock(sem, &waiter)) {
@@ -1187,8 +1189,13 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
                set_current_state(state);
 trylock_again:
                raw_spin_lock_irq(&sem->wait_lock);
+               /* PE: re-set blocked_on after wakeup re-acquires wait_lock */
+               __set_task_blocked_on_rwsem(current, sem);
        }
 
+       /* PE: clear blocked_on — lock acquired, wait_lock still held */
+       __clear_task_blocked_on_rwsem(current, sem);
+
        if (state == TASK_UNINTERRUPTIBLE)
                hung_task_clear_blocker();
 
@@ -1201,6 +1208,8 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 out_nolock:
        __set_current_state(TASK_RUNNING);
        raw_spin_lock_irq(&sem->wait_lock);
+       /* PE: clear blocked_on on signal-interrupted exit */
+       __clear_task_blocked_on_rwsem(current, sem);
        rwsem_del_wake_waiter(sem, &waiter, &wake_q);
        lockevent_inc(rwsem_wlock_fail);
        trace_contention_end(sem, -EINTR);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index dc9f17b35e4..d50c8a90908 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -54,6 +54,7 @@
 #include <linux/mmu_context.h>
 #include <linux/mmzone.h>
 #include <linux/mutex_api.h>
+#include <linux/rwsem.h>
 #include <linux/nmi.h>
 #include <linux/nospec.h>
 #include <linux/perf_event_api.h>
@@ -6594,35 +6595,69 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
        struct task_struct *owner = NULL;
        int this_cpu = cpu_of(rq);
        struct task_struct *p;
-       struct mutex *mutex;
 
        /* Follow blocked_on chain. */
        for (p = donor; task_is_blocked(p); p = owner) {
-               mutex = p->blocked_on;
+               void *blocked_lock = READ_ONCE(p->blocked_on);
+               enum blocked_on_type btype = p->blocked_on_type;
+
                /* Something changed in the chain, so pick again */
-               if (!mutex)
+               if (!blocked_lock)
                        return NULL;
-               /*
-                * By taking mutex->wait_lock we hold off concurrent mutex_unlock()
-                * and ensure @owner sticks around.
-                */
-               guard(raw_spinlock)(&mutex->wait_lock);
 
-               /* Check again that p is blocked with wait_lock held */
-               if (mutex != __get_task_blocked_on(p)) {
+               if (btype == BLOCKED_ON_MUTEX) {
+                       struct mutex *mutex = blocked_lock;
+
                        /*
-                        * Something changed in the blocked_on chain and
-                        * we don't know if only at this level. So, let's
-                        * just bail out completely and let __schedule()
-                        * figure things out (pick_again loop).
+                        * By taking mutex->wait_lock we hold off concurrent
+                        * mutex_unlock() and ensure @owner sticks around.
                         */
-                       return NULL;
-               }
+                       guard(raw_spinlock)(&mutex->wait_lock);
 
-               owner = __mutex_owner(mutex);
-               if (!owner) {
-                       __clear_task_blocked_on(p, mutex);
-                       return p;
+                       /* Check again that p is blocked with wait_lock held */
+                       if (mutex != __get_task_blocked_on(p)) {
+                               /*
+                                * Something changed in the blocked_on chain
+                                * and we don't know if only at this level.
+                                * Bail out and let __schedule() figure things
+                                * out (pick_again loop).
+                                */
+                               return NULL;
+                       }
+
+                       owner = __mutex_owner(mutex);
+                       if (!owner) {
+                               __clear_task_blocked_on(p, mutex);
+                               return p;
+                       }
+               } else if (btype == BLOCKED_ON_RWSEM) {
+                       struct rw_semaphore *sem = blocked_lock;
+
+                       /*
+                        * Take sem->wait_lock to serialise against concurrent
+                        * up_write() and ensure the owner pointer is stable.
+                        */
+                       guard(raw_spinlock)(&sem->wait_lock);
+
+                       /*
+                        * Re-check after acquiring wait_lock: blocked_on
+                        * could have been cleared by a concurrent wakeup.
+                        */
+                       if (sem != READ_ONCE(p->blocked_on))
+                               return NULL;
+
+                       owner = rwsem_owner(sem);
+                       if (!owner) {
+                               /*
+                                * rwsem is reader-owned or has no writer
+                                * owner. Cannot proxy-execute through
+                                * readers; treat as terminal node.
+                                */
+                               return p;
+                       }
+               } else {
+                       /* Unknown blocked_on type — bail */
+                       return NULL;
                }
 
                if (!READ_ONCE(owner->on_rq)) {
@@ -6630,7 +6665,7 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
                         * Owner is off the runqueue; proxy execution cannot
                         * proceed through it. Deactivate the donor so it will
                         * be properly re-enqueued when the owner eventually
-                        * wakes and releases the mutex.
+                        * wakes and releases the lock.
                         */
                        return proxy_deactivate(rq, donor);
                }
@@ -6658,12 +6693,14 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
 
                if (task_on_rq_migrating(owner)) {
                        /*
-                        * One of the chain of mutex owners is currently migrating to this
-                        * CPU, but has not yet been enqueued because we are holding the
-                        * rq lock. As a simple solution, just schedule rq->idle to give
-                        * the migration a chance to complete. Much like the migrate_task
-                        * case we should end up back in find_proxy_task(), this time
-                        * hopefully with all relevant tasks already enqueued.
+                        * One of the chain of lock owners is currently
+                        * migrating to this CPU, but has not yet been
+                        * enqueued because we are holding the rq lock. As a
+                        * simple solution, just schedule rq->idle to give
+                        * the migration a chance to complete. Much like the
+                        * migrate_task case we should end up back in
+                        * find_proxy_task(), this time hopefully with all
+                        * relevant tasks already enqueued.
                         */
                        return proxy_resched_idle(rq);
                }
@@ -6683,8 +6720,8 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
                        /*
                         * It's possible we interleave with mutex_unlock like:
                         *
-                        *                              lock(&rq->lock);
-                        *                                find_proxy_task()
+                        *                      lock(&rq->lock);
+                        *                        find_proxy_task()
                         * mutex_unlock()
                         *   lock(&wait_lock);
                         *   donor(owner) = current->blocked_donor;
@@ -6694,13 +6731,14 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
                         *     ...
                         *       ttwu_runnable()
                         *         __task_rq_lock()
-                        *                                lock(&wait_lock);
-                        *                                owner == p
+                        *                        lock(&wait_lock);
+                        *                        owner == p
                         *
-                        * Which leaves us to finish the ttwu_runnable() and make it go.
+                        * Which leaves us to finish the ttwu_runnable() and
+                        * make it go.
                         *
-                        * So schedule rq->idle so that ttwu_runnable() can get the rq
-                        * lock and mark owner as running.
+                        * So schedule rq->idle so that ttwu_runnable() can
+                        * get the rq lock and mark owner as running.
                         */
                        return proxy_resched_idle(rq);
                }
diff --git a/tools/testing/selftests/sched/proxy_exec_test.c b/tools/testing/selftests/sched/proxy_exec_test.c
new file mode 100644
index 00000000000..30fc58b9738
--- /dev/null
+++ b/tools/testing/selftests/sched/proxy_exec_test.c
@@ -0,0 +1,763 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Proxy Execution (PE) selftest
+ *
+ * Tests for the sched_proxy_exec feature. Verifies that the kernel
+ * correctly handles RT priority inheritance through proxy execution.
+ *
+ * TC-1: Basic PE activation — low-prio holder releases lock for high-prio
+ *        waiter within expected time bound.
+ * TC-2: Three-level blocked_on chain — PE chains through B->C so that
+ *        A eventually acquires its mutex.
+ * TC-3: PE deactivate path — SIGSTOP/SIGCONT on holder; high-prio thread
+ *        must still acquire the lock within a generous timeout.
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <sched.h>
+#include <errno.h>
+#include <signal.h>
+#include <time.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+
+static int test_count;
+
+/* ------------------------------------------------------------------ */
+/* Helpers                                                              */
+/* ------------------------------------------------------------------ */
+
+/*
+ * is_proxy_exec_enabled - check whether CONFIG_SCHED_PROXY_EXEC is active
+ *
+ * Try to read /proc/sys/kernel/sched_proxy_exec. If the file exists and
+ * contains a non-zero value the feature is considered enabled. Returns 1
+ * when enabled, 0 otherwise.
+ */
+static int is_proxy_exec_enabled(void)
+{
+       FILE *f;
+       int val = 0;
+
+       f = fopen("/proc/sys/kernel/sched_proxy_exec", "r");
+       if (!f)
+               return 0;
+
+       if (fscanf(f, "%d", &val) != 1)
+               val = 0;
+
+       fclose(f);
+       return val != 0;
+}
+
+/*
+ * set_rt_prio - set the calling thread to SCHED_FIFO at the given priority
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int set_rt_prio(int prio)
+{
+       struct sched_param sp = { .sched_priority = prio };
+
+       if (sched_setscheduler(0, SCHED_FIFO, &sp) != 0)
+               return -1;
+       return 0;
+}
+
+/*
+ * print_result - emit a single TAP result line
+ *
+ * Increments the global test counter and prints either "ok N - name" or
+ * "not ok N - name".
+ */
+static void print_result(const char *name, int pass)
+{
+       ++test_count;
+       if (pass)
+               printf("ok %d - %s\n", test_count, name);
+       else
+               printf("not ok %d - %s\n", test_count, name);
+}
+
+/*
+ * elapsed_ms - compute elapsed wall-clock milliseconds between two
+ * CLOCK_MONOTONIC timestamps.
+ */
+static long elapsed_ms(const struct timespec *start, const struct timespec *end)
+{
+       long diff_sec  = (long)(end->tv_sec  - start->tv_sec);
+       long diff_nsec = (long)(end->tv_nsec - start->tv_nsec);
+
+       return diff_sec * 1000L + diff_nsec / 1000000L;
+}
+
+/* ------------------------------------------------------------------ */
+/* TC-1: Basic PE activation                                            */
+/* ------------------------------------------------------------------ */
+
+struct tc1_args {
+       pthread_mutex_t *mutex;
+       int              hold_ms;       /* how long to sleep in critical section */
+};
+
+static void *tc1_holder_thread(void *arg)
+{
+       struct tc1_args *a = arg;
+       struct timespec ts = { 0, (long)a->hold_ms * 1000000L };
+
+       /* Become a low-prio RT thread so PE applies. */
+       set_rt_prio(20);
+
+       pthread_mutex_lock(a->mutex);
+       nanosleep(&ts, NULL);
+       pthread_mutex_unlock(a->mutex);
+
+       return NULL;
+}
+
+static void test_basic_pe_activation(void)
+{
+       pthread_t        holder;
+       pthread_mutex_t  mutex = PTHREAD_MUTEX_INITIALIZER;
+       struct tc1_args  args  = { .mutex = &mutex, .hold_ms = 200 };
+       struct timespec  t0, t1;
+       long             ms;
+       int              pass;
+
+       printf("# TC-1: basic PE activation\n");
+
+       /* Spawn the low-prio holder first; let it grab the mutex. */
+       if (pthread_create(&holder, NULL, tc1_holder_thread, &args) != 0) {
+               printf("# TC-1: pthread_create failed: %s\n", strerror(errno));
+               print_result("basic_pe_activation", 0);
+               return;
+       }
+
+       /*
+        * Give the holder a moment to actually lock the mutex before this
+        * (main) thread — soon to be prio 80 — tries to acquire it.
+        */
+       usleep(20000); /* 20 ms */
+
+       /* Raise our own priority so we become the blocked high-prio waiter. */
+       if (set_rt_prio(80) != 0) {
+               printf("# TC-1: set_rt_prio(80) failed: %s\n", strerror(errno));
+               pthread_join(holder, NULL);
+               pthread_mutex_destroy(&mutex);
+               print_result("basic_pe_activation", 0);
+               return;
+       }
+
+       clock_gettime(CLOCK_MONOTONIC, &t0);
+       pthread_mutex_lock(&mutex);
+       clock_gettime(CLOCK_MONOTONIC, &t1);
+       pthread_mutex_unlock(&mutex);
+
+       /* Restore to SCHED_OTHER for the remaining tests. */
+       {
+               struct sched_param sp = { .sched_priority = 0 };
+
+               sched_setscheduler(0, SCHED_OTHER, &sp);
+       }
+
+       pthread_join(holder, NULL);
+       pthread_mutex_destroy(&mutex);
+
+       ms   = elapsed_ms(&t0, &t1);
+       pass = (ms < 300L);
+       printf("# TC-1: acquired mutex in %ld ms (limit 300 ms)\n", ms);
+       print_result("basic_pe_activation", pass);
+}
+
+/* ------------------------------------------------------------------ */
+/* TC-2: Three-level blocked_on chain                                   */
+/* ------------------------------------------------------------------ */
+
+struct tc2_shared {
+       pthread_mutex_t mutex1;
+       pthread_mutex_t mutex2;
+
+       /* Synchronisation: holders signal when they have grabbed the lock. */
+       pthread_mutex_t sync_mutex;
+       pthread_cond_t  sync_cond;
+       int             holders_ready; /* incremented by each holder */
+};
+
+struct tc2_b_args {
+       struct tc2_shared *s;
+};
+
+struct tc2_c_args {
+       struct tc2_shared *s;
+       int                hold_ms;
+};
+
+/* Thread C: holds mutex2, sleeps, releases. */
+static void *tc2_c_thread(void *arg)
+{
+       struct tc2_c_args *a = arg;
+       struct tc2_shared *s = a->s;
+       struct timespec    ts = { 0, (long)a->hold_ms * 1000000L };
+
+       set_rt_prio(20);
+
+       pthread_mutex_lock(&s->mutex2);
+
+       /* Signal that we are ready. */
+       pthread_mutex_lock(&s->sync_mutex);
+       s->holders_ready++;
+       pthread_cond_broadcast(&s->sync_cond);
+       pthread_mutex_unlock(&s->sync_mutex);
+
+       nanosleep(&ts, NULL);
+       pthread_mutex_unlock(&s->mutex2);
+
+       return NULL;
+}
+
+/* Thread B: holds mutex1, then blocks on mutex2. */
+static void *tc2_b_thread(void *arg)
+{
+       struct tc2_b_args *a = arg;
+       struct tc2_shared *s = a->s;
+
+       set_rt_prio(50);
+
+       pthread_mutex_lock(&s->mutex1);
+
+       /* Signal that we are ready. */
+       pthread_mutex_lock(&s->sync_mutex);
+       s->holders_ready++;
+       pthread_cond_broadcast(&s->sync_cond);
+       pthread_mutex_unlock(&s->sync_mutex);
+
+       /* Now block on C. */
+       pthread_mutex_lock(&s->mutex2);
+       pthread_mutex_unlock(&s->mutex2);
+
+       pthread_mutex_unlock(&s->mutex1);
+
+       return NULL;
+}
+
+static void test_three_level_chain(void)
+{
+       struct tc2_shared  shared;
+       struct tc2_b_args  b_args;
+       struct tc2_c_args  c_args;
+       pthread_t          b_tid, c_tid;
+       struct timespec    t0, t1;
+       long               ms;
+       int                pass;
+
+       printf("# TC-2: three-level blocked_on chain\n");
+
+       memset(&shared, 0, sizeof(shared));
+       pthread_mutex_init(&shared.mutex1, NULL);
+       pthread_mutex_init(&shared.mutex2, NULL);
+       pthread_mutex_init(&shared.sync_mutex, NULL);
+       pthread_cond_init(&shared.sync_cond, NULL);
+       shared.holders_ready = 0;
+
+       c_args.s       = &shared;
+       c_args.hold_ms = 100;
+       b_args.s       = &shared;
+
+       /* Start C first so it grabs mutex2 before B tries. */
+       if (pthread_create(&c_tid, NULL, tc2_c_thread, &c_args) != 0) {
+               printf("# TC-2: pthread_create C failed\n");
+               goto cleanup;
+       }
+
+       /* Wait until C holds mutex2. */
+       pthread_mutex_lock(&shared.sync_mutex);
+       while (shared.holders_ready < 1)
+               pthread_cond_wait(&shared.sync_cond, &shared.sync_mutex);
+       pthread_mutex_unlock(&shared.sync_mutex);
+
+       /* Now start B so it grabs mutex1 then blocks on mutex2. */
+       if (pthread_create(&b_tid, NULL, tc2_b_thread, &b_args) != 0) {
+               printf("# TC-2: pthread_create B failed\n");
+               pthread_join(c_tid, NULL);
+               goto cleanup;
+       }
+
+       /* Wait until B holds mutex1. */
+       pthread_mutex_lock(&shared.sync_mutex);
+       while (shared.holders_ready < 2)
+               pthread_cond_wait(&shared.sync_cond, &shared.sync_mutex);
+       pthread_mutex_unlock(&shared.sync_mutex);
+
+       /* Small delay to let B actually block on mutex2. */
+       usleep(10000); /* 10 ms */
+
+       /* Raise our (A's) priority and try to acquire mutex1. */
+       if (set_rt_prio(80) != 0) {
+               printf("# TC-2: set_rt_prio(80) failed: %s\n", strerror(errno));
+               pthread_join(b_tid, NULL);
+               pthread_join(c_tid, NULL);
+               goto cleanup;
+       }
+
+       clock_gettime(CLOCK_MONOTONIC, &t0);
+       pthread_mutex_lock(&shared.mutex1);
+       clock_gettime(CLOCK_MONOTONIC, &t1);
+       pthread_mutex_unlock(&shared.mutex1);
+
+       /* Restore scheduling. */
+       {
+               struct sched_param sp = { .sched_priority = 0 };
+
+               sched_setscheduler(0, SCHED_OTHER, &sp);
+       }
+
+       pthread_join(b_tid, NULL);
+       pthread_join(c_tid, NULL);
+
+       ms   = elapsed_ms(&t0, &t1);
+       pass = (ms < 200L);
+       printf("# TC-2: acquired mutex1 in %ld ms (limit 200 ms)\n", ms);
+       print_result("three_level_chain", pass);
+
+cleanup:
+       pthread_mutex_destroy(&shared.mutex1);
+       pthread_mutex_destroy(&shared.mutex2);
+       pthread_mutex_destroy(&shared.sync_mutex);
+       pthread_cond_destroy(&shared.sync_cond);
+}
+
+/* ------------------------------------------------------------------ */
+/* TC-3: PE deactivate path (owner SIGSTOP)                             */
+/* ------------------------------------------------------------------ */
+
+/* Flag set by SIGALRM handler so we can detect timeout. */
+static volatile sig_atomic_t tc3_alarm_fired;
+
+static void tc3_alarm_handler(int sig)
+{
+       (void)sig;
+       tc3_alarm_fired = 1;
+}
+
+struct tc3_args {
+       pthread_mutex_t *mutex;
+       /* tid for SIGSTOP/SIGCONT from the main thread */
+       pid_t            tid;
+       pthread_mutex_t  ready_mutex;
+       pthread_cond_t   ready_cond;
+       int              ready;
+};
+
+static void *tc3_holder_thread(void *arg)
+{
+       struct tc3_args *a = arg;
+       struct timespec  ts = { 0, 50L * 1000000L }; /* 50 ms sleep */
+
+       set_rt_prio(20);
+
+       /* Record our own tid so main can send signals. */
+       a->tid = (pid_t)syscall(SYS_gettid);
+
+       pthread_mutex_lock(a->mutex);
+
+       /* Signal main that we hold the mutex. */
+       pthread_mutex_lock(&a->ready_mutex);
+       a->ready = 1;
+       pthread_cond_signal(&a->ready_cond);
+       pthread_mutex_unlock(&a->ready_mutex);
+
+       /*
+        * Sleep inside the critical section.  The main thread will SIGSTOP
+        * us while we are in here.
+        */
+       nanosleep(&ts, NULL);
+
+       pthread_mutex_unlock(a->mutex);
+
+       return NULL;
+}
+
+/* Arguments for the SIGCONT helper thread. */
+struct tc3_cont_args {
+       pid_t tid;
+       pid_t pid;
+};
+
+/*
+ * tc3_cont_thread - sleep 1 second then send SIGCONT to the stopped holder.
+ */
+static void *tc3_cont_thread(void *arg)
+{
+       struct tc3_cont_args *ca = arg;
+       struct timespec       ts = { 1, 0 }; /* 1 second */
+
+       nanosleep(&ts, NULL);
+       syscall(SYS_tgkill, ca->pid, ca->tid, SIGCONT);
+       return NULL;
+}
+
+static void test_pe_deactivate_sigstop(void)
+{
+       pthread_t            holder;
+       pthread_t            cont_tid;
+       pthread_mutex_t      mutex = PTHREAD_MUTEX_INITIALIZER;
+       struct tc3_args      args;
+       struct tc3_cont_args cont_args;
+       struct sigaction     sa_alarm, sa_old;
+       struct timespec      t0, t1;
+       long                 ms;
+       int                  pass = 0;
+
+       printf("# TC-3: PE deactivate path (owner SIGSTOP/SIGCONT)\n");
+
+       memset(&args, 0, sizeof(args));
+       args.mutex = &mutex;
+       pthread_mutex_init(&args.ready_mutex, NULL);
+       pthread_cond_init(&args.ready_cond, NULL);
+
+       /* Install SIGALRM handler. */
+       memset(&sa_alarm, 0, sizeof(sa_alarm));
+       sa_alarm.sa_handler = tc3_alarm_handler;
+       sigemptyset(&sa_alarm.sa_mask);
+       sa_alarm.sa_flags = 0;
+       sigaction(SIGALRM, &sa_alarm, &sa_old);
+       tc3_alarm_fired = 0;
+
+       if (pthread_create(&holder, NULL, tc3_holder_thread, &args) != 0) {
+               printf("# TC-3: pthread_create holder failed: %s\n",
+                      strerror(errno));
+               goto cleanup_sig;
+       }
+
+       /* Wait until the holder has the mutex. */
+       pthread_mutex_lock(&args.ready_mutex);
+       while (!args.ready)
+               pthread_cond_wait(&args.ready_cond, &args.ready_mutex);
+       pthread_mutex_unlock(&args.ready_mutex);
+
+       /* SIGSTOP the holder — it is now off the run queue. */
+       syscall(SYS_tgkill, getpid(), args.tid, SIGSTOP);
+
+       /* Raise our priority so we are the high-prio blocked waiter. */
+       if (set_rt_prio(80) != 0) {
+               printf("# TC-3: set_rt_prio(80) failed: %s\n", strerror(errno));
+               syscall(SYS_tgkill, getpid(), args.tid, SIGCONT);
+               pthread_join(holder, NULL);
+               goto cleanup_sig;
+       }
+
+       /* Spawn the SIGCONT helper before blocking; it fires after 1 second. */
+       cont_args.tid = args.tid;
+       cont_args.pid = getpid();
+
+       if (pthread_create(&cont_tid, NULL, tc3_cont_thread, &cont_args) != 0) {
+               printf("# TC-3: pthread_create cont failed: %s\n",
+                      strerror(errno));
+               syscall(SYS_tgkill, getpid(), args.tid, SIGCONT);
+               pthread_join(holder, NULL);
+               goto cleanup_prio;
+       }
+
+       /* Set a 5-second alarm as overall watchdog. */
+       alarm(5);
+
+       clock_gettime(CLOCK_MONOTONIC, &t0);
+       pthread_mutex_lock(&mutex);
+       clock_gettime(CLOCK_MONOTONIC, &t1);
+       pthread_mutex_unlock(&mutex);
+
+       alarm(0); /* cancel watchdog */
+
+       pthread_join(cont_tid, NULL);
+
+       ms   = elapsed_ms(&t0, &t1);
+       pass = (!tc3_alarm_fired && ms < 5000L);
+       printf("# TC-3: acquired mutex in %ld ms (limit 5000 ms, alarm=%d)\n",
+              ms, (int)tc3_alarm_fired);
+       print_result("pe_deactivate_sigstop", pass);
+
+cleanup_prio:
+       /* Restore scheduling. */
+       {
+               struct sched_param sp = { .sched_priority = 0 };
+
+               sched_setscheduler(0, SCHED_OTHER, &sp);
+       }
+
+       pthread_join(holder, NULL);
+
+cleanup_sig:
+       sigaction(SIGALRM, &sa_old, NULL);
+       pthread_mutex_destroy(&mutex);
+       pthread_mutex_destroy(&args.ready_mutex);
+       pthread_cond_destroy(&args.ready_cond);
+}
+
+/* ------------------------------------------------------------------ */
+/* TC-4: rwsem write-side PE — basic activation                        */
+/* ------------------------------------------------------------------ */
+
+/*
+ * NOTE: This test exercises the rwsem PE integration added by the
+ * proxy-exec-rwsem-support patch.  It uses pthread_rwlock_t as the
+ * user-space analogue of the kernel rw_semaphore.  The kernel PE path
+ * (blocked_on tracking in rwsem_down_write_slowpath) ensures that when
+ * a high-priority writer is blocked on an rwsem held by a low-priority
+ * writer, the holder is proxy-executed at the waiter's priority.
+ *
+ * The timing assertion (< 300 ms) is the same as TC-1: with PE the
+ * holder finishes its 200 ms critical section and releases the lock
+ * within the window.
+ */
+
+/* Arguments for the TC-4 low-priority write-lock holder. */
+struct tc4_args {
+       pthread_rwlock_t *rwlock;  /* write lock contended by the main thread */
+       int               hold_ms; /* critical-section length in milliseconds */
+};
+
+/*
+ * Low-priority holder for TC-4: takes the write lock and sits in the
+ * critical section for hold_ms before releasing it.
+ */
+static void *tc4_holder_thread(void *arg)
+{
+       struct tc4_args *a = arg;
+       /*
+        * Split hold_ms into whole seconds and nanoseconds: nanosleep()
+        * rejects tv_nsec >= 1e9 with EINVAL, so the old single-field
+        * initialiser would silently skip the hold for hold_ms >= 1000.
+        */
+       struct timespec  ts = {
+               .tv_sec  = a->hold_ms / 1000,
+               .tv_nsec = (long)(a->hold_ms % 1000) * 1000000L,
+       };
+
+       /* Become a low-prio RT thread so PE applies. */
+       set_rt_prio(20);
+
+       pthread_rwlock_wrlock(a->rwlock);
+       nanosleep(&ts, NULL);
+       pthread_rwlock_unlock(a->rwlock);
+
+       return NULL;
+}
+
+/*
+ * TC-4: a low-prio writer holds the rwlock; we raise ourselves to prio 80
+ * and block on the write lock.  With rwsem PE the holder is boosted,
+ * finishes its 200 ms critical section, and we acquire well inside 300 ms.
+ */
+static void test_rwsem_write_pe_basic(void)
+{
+       pthread_rwlock_t   wlock  = PTHREAD_RWLOCK_INITIALIZER;
+       struct tc4_args    params = { .rwlock = &wlock, .hold_ms = 200 };
+       struct sched_param drop   = { .sched_priority = 0 };
+       struct timespec    before, after;
+       pthread_t          owner;
+       long               waited;
+
+       printf("# TC-4: rwsem write-side PE activation\n");
+
+       if (pthread_create(&owner, NULL, tc4_holder_thread, &params) != 0) {
+               printf("# TC-4: pthread_create failed: %s\n",
+                      strerror(errno));
+               print_result("rwsem_write_pe_basic", 0);
+               return;
+       }
+
+       /* 20 ms grace period for the holder to take the write lock. */
+       usleep(20000);
+
+       /* Become the high-priority blocked writer. */
+       if (set_rt_prio(80) != 0) {
+               printf("# TC-4: set_rt_prio(80) failed: %s\n",
+                      strerror(errno));
+               pthread_join(owner, NULL);
+               pthread_rwlock_destroy(&wlock);
+               print_result("rwsem_write_pe_basic", 0);
+               return;
+       }
+
+       clock_gettime(CLOCK_MONOTONIC, &before);
+       pthread_rwlock_wrlock(&wlock);
+       clock_gettime(CLOCK_MONOTONIC, &after);
+       pthread_rwlock_unlock(&wlock);
+
+       /* Drop back to SCHED_OTHER before reporting. */
+       sched_setscheduler(0, SCHED_OTHER, &drop);
+
+       pthread_join(owner, NULL);
+       pthread_rwlock_destroy(&wlock);
+
+       waited = elapsed_ms(&before, &after);
+       printf("# TC-4: acquired write lock in %ld ms (limit 300 ms)\n",
+              waited);
+       print_result("rwsem_write_pe_basic", waited < 300L);
+}
+
+/* ------------------------------------------------------------------ */
+/* TC-5: Mixed chain — rwsem write → mutex                             */
+/* ------------------------------------------------------------------ */
+
+/*
+ * Chain layout:
+ *   A (prio=80) waiting for rwlock write
+ *   B (prio=50) holds rwlock write, waiting for mutex
+ *   C (prio=20) holds mutex, sleeping 300 ms
+ *
+ * With rwsem PE support the chain A→rwlock→B→mutex→C is fully
+ * traversed.  A should acquire the rwlock within 500 ms.
+ */
+
+/* Shared state for the TC-5 three-thread chain A -> rwlock -> B -> mutex -> C. */
+struct tc5_shared {
+       pthread_rwlock_t rwlock;  /* held for write by B, contended by main (A) */
+       pthread_mutex_t  mutex;   /* held by C, contended by B */
+
+       pthread_mutex_t  sync_mu; /* protects 'ready' */
+       pthread_cond_t   sync_cv; /* signalled on each 'ready' increment */
+       int              ready; /* incremented by B and C when ready */
+};
+
+/* Per-thread argument wrappers; C additionally carries its hold time (ms). */
+struct tc5_b_args { struct tc5_shared *s; };
+struct tc5_c_args { struct tc5_shared *s; int hold_ms; };
+
+/*
+ * TC-5 thread C (prio 20): end of the PE chain.  Takes the mutex, signals
+ * readiness, then sleeps hold_ms inside the critical section.
+ */
+static void *tc5_c_thread(void *arg)
+{
+       struct tc5_c_args *a = arg;
+       struct tc5_shared *s = a->s;
+       /*
+        * Split hold_ms into tv_sec/tv_nsec: nanosleep() rejects
+        * tv_nsec >= 1e9 with EINVAL, so the old single-field form would
+        * silently skip the hold for hold_ms >= 1000.
+        */
+       struct timespec    ts = {
+               .tv_sec  = a->hold_ms / 1000,
+               .tv_nsec = (long)(a->hold_ms % 1000) * 1000000L,
+       };
+
+       set_rt_prio(20);
+
+       pthread_mutex_lock(&s->mutex);
+
+       /* Tell the main thread we hold the mutex. */
+       pthread_mutex_lock(&s->sync_mu);
+       s->ready++;
+       pthread_cond_signal(&s->sync_cv);
+       pthread_mutex_unlock(&s->sync_mu);
+
+       nanosleep(&ts, NULL);
+       pthread_mutex_unlock(&s->mutex);
+       return NULL;
+}
+
+/*
+ * TC-5 thread B (prio 50): middle of the PE chain.  Holds the rwlock for
+ * write while blocking on the mutex owned by C.
+ */
+static void *tc5_b_thread(void *arg)
+{
+       struct tc5_shared *shared = ((struct tc5_b_args *)arg)->s;
+
+       set_rt_prio(50);
+
+       /* Grab the write lock that the main thread (A) will contend for. */
+       pthread_rwlock_wrlock(&shared->rwlock);
+
+       /* Announce readiness to the main thread. */
+       pthread_mutex_lock(&shared->sync_mu);
+       shared->ready++;
+       pthread_cond_signal(&shared->sync_cv);
+       pthread_mutex_unlock(&shared->sync_mu);
+
+       /* Block on the mutex — this is the middle of the PE chain. */
+       pthread_mutex_lock(&shared->mutex);
+       pthread_mutex_unlock(&shared->mutex);
+
+       pthread_rwlock_unlock(&shared->rwlock);
+       return NULL;
+}
+
+/*
+ * TC-5: verify PE traverses a mixed chain A(80) -> rwlock -> B(50) ->
+ * mutex -> C(20).  A must acquire the write lock within 500 ms.
+ */
+static void test_rwsem_mutex_chain(void)
+{
+       pthread_t         tb, tc;
+       struct tc5_shared s = {
+               .rwlock  = PTHREAD_RWLOCK_INITIALIZER,
+               .mutex   = PTHREAD_MUTEX_INITIALIZER,
+               .sync_mu = PTHREAD_MUTEX_INITIALIZER,
+               .sync_cv = PTHREAD_COND_INITIALIZER,
+               .ready   = 0,
+       };
+       struct tc5_b_args bargs = { .s = &s };
+       struct tc5_c_args cargs = { .s = &s, .hold_ms = 300 };
+       struct timespec   t0, t1;
+       long              ms;
+       int               pass;
+
+       printf("# TC-5: mixed rwsem-write -> mutex PE chain\n");
+
+       if (pthread_create(&tc, NULL, tc5_c_thread, &cargs) != 0) {
+               printf("# TC-5: pthread_create failed: %s\n",
+                      strerror(errno));
+               print_result("rwsem_mutex_chain", 0);
+               return;
+       }
+
+       if (pthread_create(&tb, NULL, tc5_b_thread, &bargs) != 0) {
+               printf("# TC-5: pthread_create failed: %s\n",
+                      strerror(errno));
+               /*
+                * C is already running and references the stack-local
+                * shared state; join it before returning so it cannot
+                * touch 's' after this frame is gone.
+                */
+               pthread_join(tc, NULL);
+               print_result("rwsem_mutex_chain", 0);
+               return;
+       }
+
+       /* Wait until both B and C have grabbed their locks. */
+       pthread_mutex_lock(&s.sync_mu);
+       while (s.ready < 2)
+               pthread_cond_wait(&s.sync_cv, &s.sync_mu);
+       pthread_mutex_unlock(&s.sync_mu);
+
+       /* Become the high-priority head of the chain. */
+       if (set_rt_prio(80) != 0) {
+               printf("# TC-5: set_rt_prio(80) failed: %s\n",
+                      strerror(errno));
+               pthread_join(tb, NULL);
+               pthread_join(tc, NULL);
+               print_result("rwsem_mutex_chain", 0);
+               return;
+       }
+
+       clock_gettime(CLOCK_MONOTONIC, &t0);
+       pthread_rwlock_wrlock(&s.rwlock);
+       clock_gettime(CLOCK_MONOTONIC, &t1);
+       pthread_rwlock_unlock(&s.rwlock);
+
+       /* Restore normal scheduling before reporting. */
+       {
+               struct sched_param sp = { .sched_priority = 0 };
+
+               sched_setscheduler(0, SCHED_OTHER, &sp);
+       }
+
+       pthread_join(tb, NULL);
+       pthread_join(tc, NULL);
+       pthread_rwlock_destroy(&s.rwlock);
+       pthread_mutex_destroy(&s.mutex);
+       pthread_mutex_destroy(&s.sync_mu);
+       pthread_cond_destroy(&s.sync_cv);
+
+       ms   = elapsed_ms(&t0, &t1);
+       pass = (ms < 500L);
+       printf("# TC-5: acquired write lock in %ld ms (limit 500 ms)\n",
+              ms);
+       print_result("rwsem_mutex_chain", pass);
+}
+
+/* ------------------------------------------------------------------ */
+/* main                                                                 */
+/* ------------------------------------------------------------------ */
+
+/*
+ * Entry point: probe for CAP_SYS_NICE and PE support, emit the TAP plan,
+ * then run all five test cases.
+ */
+int main(void)
+{
+       struct sched_param sp = { .sched_priority = 1 };
+       int rc = sched_setscheduler(0, SCHED_FIFO, &sp);
+
+       /*
+        * Capability probe: try to become SCHED_FIFO prio 1.  A plain
+        * EPERM means we lack CAP_SYS_NICE — skip the suite gracefully.
+        */
+       if (rc == 0) {
+               /* Probe succeeded — drop back to SCHED_OTHER before testing. */
+               sp.sched_priority = 0;
+               sched_setscheduler(0, SCHED_OTHER, &sp);
+       } else if (errno == EPERM) {
+               printf("1..0 # SKIP: requires CAP_SYS_NICE\n");
+               return 0;
+       }
+       /* Any other probe error changed nothing — just continue. */
+
+       if (!is_proxy_exec_enabled()) {
+               printf("1..0 # SKIP: CONFIG_SCHED_PROXY_EXEC not enabled\n");
+               return 0;
+       }
+
+       /* TAP plan: five test cases (3 original + 2 rwsem) */
+       printf("1..5\n");
+
+       test_basic_pe_activation();
+       test_three_level_chain();
+       test_pe_deactivate_sigstop();
+       test_rwsem_write_pe_basic();
+       test_rwsem_mutex_chain();
+
+       return 0;
+}
-- 
2.43.0


Reply via email to