On 2019-09-28 04:29, Andrew Morton wrote:
On Wed, 25 Sep 2019 09:56:03 +0800 hev <r...@hev.cc> wrote:

From: Heiher <r...@hev.cc>

Take the case where we have:

        t0
         | (ew)
        e0
         | (et)
        e1
         | (lt)
        s0

t0: thread 0
e0: epoll fd 0
e1: epoll fd 1
s0: socket fd 0
ew: epoll_wait
et: edge-trigger
lt: level-trigger

We only need to wake up nested epoll fds if something has been queued to
the overflow list, since ep_poll() traverses the rdllist during recursive
poll and thus events on the overflow list may not be visible yet.

Test code:

Looks sane to me.  Do you have any performance testing results which
show a benefit?

epoll maintainership isn't exactly a hive of activity nowadays :(
Roman, would you have time to review this?

So here is my observation: the current patch does not fix the described
problem (double wakeup) for the case when a new event comes exactly
to the ->ovflist. According to the patch, this is the desired intention:

    /*
     * We only need to wakeup nested epoll fds if something has been queued
     * to the overflow list, since the ep_poll() traverses the rdllist
     * during recursive poll and thus events on the overflow list may not be
     * visible yet.
     */
    if (nepi != NULL)
            pwake++;

    ....

    if (pwake == 2)
            ep_poll_safewake(&ep->poll_wait);


but this actually means that we repeat the same behavior (double wakeup),
only now for the case when the event comes to the ->ovflist.

How to reproduce? It can be done (ok, not so easily, but it is possible
to try): to the given userspace test we need to add one more socket and
immediately fire an event on it:

    e.events = EPOLLIN;
    if (epoll_ctl(efd[1], EPOLL_CTL_ADD, s2fd[0], &e) < 0)
            goto out;

    /*
     * Signal any fd to let epoll_wait() call ep_scan_ready_list(),
     * in order to "catch" it there and add the new event to ->ovflist.
     */
    if (write(s2fd[1], "w", 1) != 1)
            goto out;

That is done so that the following epoll_wait() call invokes
ep_scan_ready_list(), where we can "catch" the new event and insert it
exactly into the ->ovflist. To make sure the event lands in the correct
list, I introduce an artificial delay.

The modified test and kernel patch are below.  Here is the output of the
testing tool with some debug lines from the kernel:

  # ~/devel/test/edge-bug
  [   59.263178] ### sleep 2
  >> write to sock
  [   61.318243] ### done sleep
  [   61.318991] !!!!!!!!!!! ep_poll_safewake(&ep->poll_wait); events_in_rdllist=1, events_in_ovflist=1
  [   61.321204] ### sleep 2
  [   63.398325] ### done sleep
  error: What?! Again?!

The first epoll_wait() call (ep_scan_ready_list()) observes 2 events
(see the "!!!!!!!!!!! ep_poll_safewake" output line), exactly what we
wanted to achieve, so eventually ep_poll_safewake() is called again,
which leads to the double wakeup.

In my opinion the current patch, as it is, should be dropped: it does not
fix the described problem but just hides it.

--
Roman


######### USERSPACE ##########

#include <unistd.h>
#include <sys/epoll.h>
#include <sys/socket.h>
#include <stdio.h>
#include <pthread.h>

static void *do_thread(void *arg)
{
        int s = *(int *)arg;

        sleep(1);
        printf(">> write to sock\n");
        write(s, "w", 1);

        return NULL;
}

int main(int argc, char *argv[])
{
        int s1fd[2];
        int s2fd[2];
        int efd[2];
        struct epoll_event e;

        if (socketpair(AF_UNIX, SOCK_STREAM, 0, s1fd) < 0)
                goto out;
        if (socketpair(AF_UNIX, SOCK_STREAM, 0, s2fd) < 0)
                goto out;

        efd[0] = epoll_create(1);
        if (efd[0] < 0)
                goto out;

        efd[1] = epoll_create(1);
        if (efd[1] < 0)
                goto out;

        e.events = EPOLLIN;
        if (epoll_ctl(efd[1], EPOLL_CTL_ADD, s1fd[0], &e) < 0)
                goto out;

        e.events = EPOLLIN;
        if (epoll_ctl(efd[1], EPOLL_CTL_ADD, s2fd[0], &e) < 0)
                goto out;

        e.events = EPOLLIN | EPOLLET;
        if (epoll_ctl(efd[0], EPOLL_CTL_ADD, efd[1], &e) < 0)
                goto out;

        /*
         * Signal any fd to let epoll_wait() call ep_scan_ready_list(),
         * in order to "catch" it there and add the new event to ->ovflist.
         */
        if (write(s2fd[1], "w", 1) != 1)
                goto out;

        pthread_t thr;
        pthread_create(&thr, NULL, do_thread, &s1fd[1]);
        if (epoll_wait(efd[0], &e, 1, 0) != 1)
                goto out;
        pthread_join(thr, NULL);

        if (epoll_wait(efd[0], &e, 1, 0) != 0) {
                printf("error: What?! Again?!\n");
                goto out;
        }

        return 0;

out:
        return -1;
}
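
Note that the reproducer binary must be named "edge-bug", since the
kernel debug patch below keys off current->comm. Build and run it with
something like:

    gcc -pthread -o edge-bug edge-bug.c
    ./edge-bug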


######### KERNEL ##########

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 8bc064630be0..edba7ab45083 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -39,6 +39,8 @@
 #include <linux/rculist.h>
 #include <net/busy_poll.h>

+static bool is_send_events_call;
+
 /*
  * LOCKING:
  * There are three level of locking required by epoll :
@@ -672,6 +674,8 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
        __poll_t res;
        int pwake = 0;
        struct epitem *epi, *nepi;
+       unsigned events_in_rdllist = 0;
+       unsigned events_in_ovflist = 0;
        LIST_HEAD(txlist);

        lockdep_assert_irqs_enabled();
@@ -693,23 +697,52 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
         * in a lockless way.
         */
        write_lock_irq(&ep->lock);
+
+       /* XXX Count events */
+       if (!strcmp("edge-bug", current->comm) && depth) {
+               struct list_head *l;
+               list_for_each(l, &ep->rdllist)
+                       events_in_rdllist++;
+       }
        list_splice_init(&ep->rdllist, &txlist);
        WRITE_ONCE(ep->ovflist, NULL);
        write_unlock_irq(&ep->lock);

+       if (!strcmp("edge-bug", current->comm) && depth && is_send_events_call) {
+               /*
+                * XXX Introduce delay to let userspace fire event
+                * XXX directly to ovflist.
+                */
+               pr_err("### sleep 2\n");
+               msleep(2000);
+               pr_err("### done sleep\n");
+       }
+
+
        /*
         * Now call the callback function.
         */
        res = (*sproc)(ep, &txlist, priv);

        write_lock_irq(&ep->lock);
+       nepi = READ_ONCE(ep->ovflist);
+       /*
+        * We only need to wakeup nested epoll fds if something has been queued
+        * to the overflow list, since the ep_poll() traverses the rdllist
+        * during recursive poll and thus events on the overflow list may not be
+        * visible yet.
+        */
+       if (nepi != NULL)
+               pwake++;
        /*
         * During the time we spent inside the "sproc" callback, some
         * other events might have been queued by the poll callback.
         * We re-insert them inside the main ready-list here.
         */
-       for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL;
+       for (; (epi = nepi) != NULL;
             nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
+               /* XXX Count events */
+               events_in_ovflist++;
                /*
                 * We need to check if the item is already in the list.
                 * During the "sproc" callback execution time, items are
@@ -754,8 +787,11 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
                mutex_unlock(&ep->mtx);

        /* We have to call this outside the lock */
-       if (pwake)
+       if (pwake == 2) {
+               pr_err("!!!!!!!!!!! ep_poll_safewake(&ep->poll_wait); events_in_rdllist=%d, events_in_ovflist=%d\n",
+                      events_in_rdllist, events_in_ovflist);
                ep_poll_safewake(&ep->poll_wait);
+       }

        return res;
 }
@@ -1925,9 +1961,16 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
         * there's still timeout left over, we go trying again in search of
         * more luck.
         */
+
+       /* XXX Catch only ep_scan_ready_list() called from here */
+       if (!strcmp("edge-bug", current->comm))
+               is_send_events_call = 1;
        if (!res && eavail &&
-           !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
+           !(res = ep_send_events(ep, events, maxevents)) && !timed_out) {
+               is_send_events_call = 0;
                goto fetch_events;
+       }
+       is_send_events_call = 0;

        if (waiter) {
                spin_lock_irq(&ep->wq.lock);
