On 9/23/19 2:48 PM, Jens Axboe wrote:
> On 9/23/19 10:32 AM, Pavel Begunkov wrote:
>> Sorry, mixed the threads.
>>
>>>>
>>>> I'm not sure an extension is needed for such a special interface; why not
>>>> just put a ->threshold value next to the ctx->wait field and use either
>>>> the regular wait_event() APIs with the proper condition, or
>>>> wait_event_cmd()-style APIs if you absolutely need something more complex
>>>> to happen inside?
>> Ingo,
>> io_uring works well without this patch, just using wait_event_*() with a
>> proper condition, but there are performance issues with spurious
>> wakeups; see the detailed description in the previous mail.
>> Am I missing something?
> 
> I think we can do the same thing, just wrapping the waitqueue in a
> structure with a count in it, on the stack. I've got some flight time
> coming up later today; let me try and cook up a patch.

Totally untested, and sent out 5 min before departure... But something
like this.


diff --git a/fs/io_uring.c b/fs/io_uring.c
index ca7570aca430..c2f9e1da26dd 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2768,6 +2768,47 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
        return submit;
 }
 
+struct io_wait_queue {
+       struct wait_queue_entry wq;
+       struct io_ring_ctx *ctx;
+       struct task_struct *task;
+       unsigned to_wait;
+       unsigned nr_timeouts;
+};
+
+static inline bool io_should_wake(struct io_wait_queue *iowq)
+{
+       struct io_ring_ctx *ctx = iowq->ctx;
+
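+       /*
+        * Wake up if we have enough events, or if a timeout occurred since
+        * we started waiting. For timeouts, we always want to return to
+        * userspace, regardless of event count.
+        */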
+       return io_cqring_events(ctx->rings) >= iowq->to_wait ||
+                       atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
+}
+
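+/*
+ * Wake function hooked into ctx->wait: only wake the task once its
+ * condition is met. A negative return stops __wake_up_common() from
+ * scanning further entries, leaving the waiter queued and asleep.
+ */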
+static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
+                           int wake_flags, void *key)
+{
+       struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
+                                                       wq);
+
+       if (io_should_wake(iowq)) {
+               list_del_init(&curr->entry);
+               wake_up_process(iowq->task);
+               return 1;
+       }
+
+       return -1;
+}
+
 /*
  * Wait until events become available, if we don't already have some. The
  * application must reap them itself, as they reside on the shared cq ring.
@@ -2775,8 +2816,16 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
                          const sigset_t __user *sig, size_t sigsz)
 {
+       struct io_wait_queue iowq = {
+               .wq = {
+                       .func   = io_wake_function,
+                       .entry  = LIST_HEAD_INIT(iowq.wq.entry),
+               },
+               .task           = current,
+               .ctx            = ctx,
+               .to_wait        = min_events,
+       };
        struct io_rings *rings = ctx->rings;
-       unsigned nr_timeouts;
        int ret;
 
        if (io_cqring_events(rings) >= min_events)
@@ -2795,15 +2844,22 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
                        return ret;
        }
 
-       nr_timeouts = atomic_read(&ctx->cq_timeouts);
-       /*
-        * Return if we have enough events, or if a timeout occured since
-        * we started waiting. For timeouts, we always want to return to
-        * userspace.
-        */
-       ret = wait_event_interruptible(ctx->wait,
-                               io_cqring_events(rings) >= min_events ||
-                               atomic_read(&ctx->cq_timeouts) != nr_timeouts);
+       ret = 0;
+       iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
+       prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, TASK_INTERRUPTIBLE);
+       do {
+               if (io_should_wake(&iowq))
+                       break;
+               schedule();
+               /* a pending signal breaks the wait; mapped to -EINTR below */
+               if (signal_pending(current)) {
+                       ret = -ERESTARTSYS;
+                       break;
+               }
+               set_current_state(TASK_INTERRUPTIBLE);
+       } while (1);
+       finish_wait(&ctx->wait, &iowq.wq);
+
        restore_saved_sigmask_unless(ret == -ERESTARTSYS);
        if (ret == -ERESTARTSYS)
                ret = -EINTR;
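
For reference, the completion side is untouched; each posted CQE still
pokes ctx->wait through something like the below (a from-memory sketch
of the existing io_cqring_ev_posted(), not part of this diff):

static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
{
        /* wake CQ waiters as before... */
        if (waitqueue_active(&ctx->wait))
                wake_up(&ctx->wait);
        /*
         * ...but io_wake_function() now filters the wakeup, so a task
         * waiting for N completions is only scheduled once
         * io_should_wake() is satisfied, instead of on every CQE.
         */
}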

-- 
Jens Axboe
