From: Peter Oskolkov <p...@google.com>

This is an RFC!
As Paul Turner presented at LPC in 2013 ...
- pdf: 
http://pdxplumbers.osuosl.org/2013/ocw//system/presentations/1653/original/LPC%20-%20User%20Threading.pdf
- video: https://www.youtube.com/watch?v=KXuZi9aeGTw

... Google has developed an M:N userspace threading subsystem backed
by Google-private SwitchTo Linux Kernel API (page 17 in the pdf referenced
above). This subsystem provides latency-sensitive services at Google with
fine-grained user-space control/scheduling over what is running when,
and this subsystem is used widely internally (called schedulers or fibers).

This RFC patchset is the first step to open-source this work. As explained
in the linked pdf and video, SwitchTo API has three core operations: wait,
resume, and swap (=switch). So this patchset adds a FUTEX_SWAP operation
that, in addition to FUTEX_WAIT and FUTEX_WAKE, will provide a foundation
on top of which user-space threading libraries can be built.

Another common use case for FUTEX_SWAP is RPC-style message passing
between tasks: task/thread T1 prepares a message,
wakes T2 to work on it, and waits for the results; when T2 is done, it
wakes T1 and waits for more work to arrive. Currently the simplest
way to implement this is

a. T1: futex-wake T2, futex-wait
b. T2: wakes, does what it has been woken to do
c. T2: futex-wake T1, futex-wait

With FUTEX_SWAP, each of steps a and c above can be reduced to a single
futex operation that runs 5-10 times faster than the wake + wait pair.

Patches in this patchset:

Patch 1: (this patch) introduce a FUTEX_SWAP futex operation that,
         internally, does wake + wait. The purpose of this patch is
         to work out the API.
Patch 2: a first rough attempt to make FUTEX_SWAP faster than
         what wake + wait can do.
Patch 3: a selftest that can also be used to benchmark FUTEX_SWAP vs
         FUTEX_WAKE + FUTEX_WAIT.

Tested: see patch 3 in this patchset.

Signed-off-by: Peter Oskolkov <p...@google.com>
---
 include/uapi/linux/futex.h |  2 +
 kernel/futex.c             | 97 +++++++++++++++++++++++++++++++-------
 2 files changed, 83 insertions(+), 16 deletions(-)

diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h
index a89eb0accd5e..c1d151d97dea 100644
--- a/include/uapi/linux/futex.h
+++ b/include/uapi/linux/futex.h
@@ -21,6 +21,7 @@
 #define FUTEX_WAKE_BITSET      10
 #define FUTEX_WAIT_REQUEUE_PI  11
 #define FUTEX_CMP_REQUEUE_PI   12
+#define FUTEX_SWAP             13

 #define FUTEX_PRIVATE_FLAG     128
 #define FUTEX_CLOCK_REALTIME   256
@@ -40,6 +41,7 @@
                                         FUTEX_PRIVATE_FLAG)
 #define FUTEX_CMP_REQUEUE_PI_PRIVATE   (FUTEX_CMP_REQUEUE_PI | \
                                         FUTEX_PRIVATE_FLAG)
+#define FUTEX_SWAP_PRIVATE             (FUTEX_SWAP | FUTEX_PRIVATE_FLAG)

 /*
  * Support for robust futexes: the kernel cleans up held futexes at
diff --git a/kernel/futex.c b/kernel/futex.c
index e646661f6282..670d6d113561 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1595,16 +1595,16 @@ double_unlock_hb(struct futex_hash_bucket *hb1, struct 
futex_hash_bucket *hb2)
 }

 /*
- * Wake up waiters matching bitset queued on this futex (uaddr).
+ * Prepare wake queue matching bitset queued on this futex (uaddr).
  */
 static int
-futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
+prepare_wake_q(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset,
+              struct wake_q_head *wake_q)
 {
        struct futex_hash_bucket *hb;
        struct futex_q *this, *next;
        union futex_key key = FUTEX_KEY_INIT;
        int ret;
-       DEFINE_WAKE_Q(wake_q);

        if (!bitset)
                return -EINVAL;
@@ -1632,20 +1632,34 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int 
nr_wake, u32 bitset)
                        if (!(this->bitset & bitset))
                                continue;

-                       mark_wake_futex(&wake_q, this);
+                       mark_wake_futex(wake_q, this);
                        if (++ret >= nr_wake)
                                break;
                }
        }

        spin_unlock(&hb->lock);
-       wake_up_q(&wake_q);
 out_put_key:
        put_futex_key(&key);
 out:
        return ret;
 }

+/*
+ * Wake up waiters matching bitset queued on this futex (uaddr).
+ */
+static int
+futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
+{
+       int ret;
+       DEFINE_WAKE_Q(wake_q);
+
+       ret = prepare_wake_q(uaddr, flags, nr_wake, bitset, &wake_q);
+       wake_up_q(&wake_q);
+
+       return ret;
+}
+
 static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
 {
        unsigned int op =         (encoded_op & 0x70000000) >> 28;
@@ -2603,9 +2617,12 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q 
*q, int locked)
  * @hb:                the futex hash bucket, must be locked by the caller
  * @q:         the futex_q to queue up on
  * @timeout:   the prepared hrtimer_sleeper, or null for no timeout
+ * @next:      if present, wake next and hint to the scheduler that we'd
+ *             prefer to execute it locally.
  */
 static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q 
*q,
-                               struct hrtimer_sleeper *timeout)
+                               struct hrtimer_sleeper *timeout,
+                               struct task_struct *next)
 {
        /*
         * The task state is guaranteed to be set before another task can
@@ -2630,10 +2647,27 @@ static void futex_wait_queue_me(struct 
futex_hash_bucket *hb, struct futex_q *q,
                 * flagged for rescheduling. Only call schedule if there
                 * is no timeout, or if it has yet to expire.
                 */
-               if (!timeout || timeout->task)
+               if (!timeout || timeout->task) {
+                       if (next) {
+                               /*
+                                * wake_up_process() below will be replaced
+                                * in the next patch with
+                                * wake_up_process_prefer_current_cpu().
+                                */
+                               wake_up_process(next);
+                               put_task_struct(next);
+                               next = NULL;
+                       }
                        freezable_schedule();
+               }
        }
        __set_current_state(TASK_RUNNING);
+
+       if (next) {
+               /* Maybe call wake_up_process_prefer_current_cpu()? */
+               wake_up_process(next);
+               put_task_struct(next);
+       }
 }

 /**
@@ -2713,7 +2747,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, 
unsigned int flags,
 }

 static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
-                     ktime_t *abs_time, u32 bitset)
+                     ktime_t *abs_time, u32 bitset, struct task_struct *next)
 {
        struct hrtimer_sleeper timeout, *to;
        struct restart_block *restart;
@@ -2737,7 +2771,8 @@ static int futex_wait(u32 __user *uaddr, unsigned int 
flags, u32 val,
                goto out;

        /* queue_me and wait for wakeup, timeout, or a signal. */
-       futex_wait_queue_me(hb, &q, to);
+       futex_wait_queue_me(hb, &q, to, next);
+       next = NULL;

        /* If we were woken (and unqueued), we succeeded, whatever. */
        ret = 0;
@@ -2770,6 +2805,10 @@ static int futex_wait(u32 __user *uaddr, unsigned int 
flags, u32 val,
        ret = -ERESTART_RESTARTBLOCK;

 out:
+       if (next) {
+               wake_up_process(next);
+               put_task_struct(next);
+       }
        if (to) {
                hrtimer_cancel(&to->timer);
                destroy_hrtimer_on_stack(&to->timer);
@@ -2777,7 +2816,6 @@ static int futex_wait(u32 __user *uaddr, unsigned int 
flags, u32 val,
        return ret;
 }

-
 static long futex_wait_restart(struct restart_block *restart)
 {
        u32 __user *uaddr = restart->futex.uaddr;
@@ -2789,10 +2827,35 @@ static long futex_wait_restart(struct restart_block 
*restart)
        }
        restart->fn = do_no_restart_syscall;

-       return (long)futex_wait(uaddr, restart->futex.flags,
-                               restart->futex.val, tp, restart->futex.bitset);
+       return (long)futex_wait(uaddr, restart->futex.flags, restart->futex.val,
+                               tp, restart->futex.bitset, NULL);
 }

+static int futex_swap(u32 __user *uaddr, unsigned int flags, u32 val,
+                     ktime_t *abs_time, u32 __user *uaddr2)
+{
+       u32 bitset = FUTEX_BITSET_MATCH_ANY;
+       struct task_struct *next = NULL;
+       DEFINE_WAKE_Q(wake_q);
+       int ret;
+
+       ret = prepare_wake_q(uaddr2, flags, 1, bitset, &wake_q);
+       if (!wake_q_empty(&wake_q)) {
+               /* Pull the first wakee out of the queue to swap into. */
+               next = container_of(wake_q.first, struct task_struct, wake_q);
+               wake_q.first = wake_q.first->next;
+               next->wake_q.next = NULL;
+               /*
+                * Note that wake_up_q does not touch wake_q.last, so we
+                * do not bother with it here.
+                */
+               wake_up_q(&wake_q);
+       }
+       if (ret < 0)
+               return ret;
+
+       return futex_wait(uaddr, flags, val, abs_time, bitset, next);
+}

 /*
  * Userspace tried a 0 -> TID atomic transition of the futex value
@@ -3278,7 +3341,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, 
unsigned int flags,
        }

        /* Queue the futex_q, drop the hb lock, wait for wakeup. */
-       futex_wait_queue_me(hb, &q, to);
+       futex_wait_queue_me(hb, &q, to, NULL);

        spin_lock(&hb->lock);
        ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
@@ -3808,7 +3871,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t 
*timeout,
                val3 = FUTEX_BITSET_MATCH_ANY;
                /* fall through */
        case FUTEX_WAIT_BITSET:
-               return futex_wait(uaddr, flags, val, timeout, val3);
+               return futex_wait(uaddr, flags, val, timeout, val3, NULL);
        case FUTEX_WAKE:
                val3 = FUTEX_BITSET_MATCH_ANY;
                /* fall through */
@@ -3832,6 +3895,8 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t 
*timeout,
                                             uaddr2);
        case FUTEX_CMP_REQUEUE_PI:
                return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
+       case FUTEX_SWAP:
+               return futex_swap(uaddr, flags, val, timeout, uaddr2);
        }
        return -ENOSYS;
 }
@@ -3848,7 +3913,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, 
val,

        if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
                      cmd == FUTEX_WAIT_BITSET ||
-                     cmd == FUTEX_WAIT_REQUEUE_PI)) {
+                     cmd == FUTEX_WAIT_REQUEUE_PI || cmd == FUTEX_SWAP)) {
                if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
                        return -EFAULT;
                if (get_timespec64(&ts, utime))
@@ -3857,7 +3922,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, 
val,
                        return -EINVAL;

                t = timespec64_to_ktime(ts);
-               if (cmd == FUTEX_WAIT)
+               if (cmd == FUTEX_WAIT || cmd == FUTEX_SWAP)
                        t = ktime_add_safe(ktime_get(), t);
                tp = &t;
        }
--
2.25.1

Reply via email to