Now that connect() permanently registers a callback routine on the peer's
peer_wait queue, unix_dgram_recvmsg() would otherwise incur extra overhead,
since it unconditionally wakes up its peer_wait queue on every receive.
This patch makes that wakeup conditional on there being waiters.

Signed-off-by: Jason Baron <jba...@akamai.com>
---
 include/net/af_unix.h |  1 +
 net/unix/af_unix.c    | 85 ++++++++++++++++++++++++++++++++++++---------------
 2 files changed, 62 insertions(+), 24 deletions(-)

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 6a4a345..cf21ffd 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -61,6 +61,7 @@ struct unix_sock {
        unsigned long           flags;
 #define UNIX_GC_CANDIDATE      0
 #define UNIX_GC_MAYBE_CYCLE    1
+#define UNIX_NOSPACE           2
        struct socket_wq        peer_wq;
        wait_queue_t            wait;
 };
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index f789423..66979d4 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -326,7 +326,7 @@ found:
        return s;
 }
 
-static inline int unix_writable(struct sock *sk)
+static inline bool unix_writable(struct sock *sk)
 {
        return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
 }
@@ -1079,6 +1079,12 @@ static long unix_wait_for_peer(struct sock *other, long timeo)
 
        prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
 
+       set_bit(UNIX_NOSPACE, &u->flags);
+       /* Ensure that we either see space in the peer sk_receive_queue via the
+        * unix_recvq_full() check below, or we receive a wakeup when it
+        * empties. Pairs with the mb in unix_dgram_recvmsg().
+        */
+       smp_mb__after_atomic();
        sched = !sock_flag(other, SOCK_DEAD) &&
                !(other->sk_shutdown & RCV_SHUTDOWN) &&
                unix_recvq_full(other);
@@ -1623,17 +1629,27 @@ restart:
 
        if (unix_peer(other) != sk && unix_recvq_full(other)) {
                if (!timeo) {
-                       err = -EAGAIN;
-                       goto out_unlock;
-               }
+                       set_bit(UNIX_NOSPACE, &unix_sk(other)->flags);
+                       /* Ensure that we either see space in the peer
+                        * sk_receive_queue via the unix_recvq_full() check
+                        * below, or we receive a wakeup when it empties. This
+                        * makes sure that epoll ET triggers correctly. Pairs
+                        * with the mb in unix_dgram_recvmsg().
+                        */
+                       smp_mb__after_atomic();
+                       if (unix_recvq_full(other)) {
+                               err = -EAGAIN;
+                               goto out_unlock;
+                       }
+               } else {
+                       timeo = unix_wait_for_peer(other, timeo);
 
-               timeo = unix_wait_for_peer(other, timeo);
+                       err = sock_intr_errno(timeo);
+                       if (signal_pending(current))
+                               goto out_free;
 
-               err = sock_intr_errno(timeo);
-               if (signal_pending(current))
-                       goto out_free;
-
-               goto restart;
+                       goto restart;
+               }
        }
 
        if (sock_flag(other, SOCK_RCVTSTAMP))
@@ -1939,8 +1955,19 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
                goto out_unlock;
        }
 
-       wake_up_interruptible_sync_poll(&u->peer_wait,
-                                       POLLOUT | POLLWRNORM | POLLWRBAND);
+       /* Ensure that waiters on our sk->sk_receive_queue draining that check
+        * via unix_recvq_full() either see space in the queue or get a wakeup
+        * below. sk->sk_receive_queue is reduced by the __skb_recv_datagram()
+        * call above. Pairs with the mb in unix_dgram_sendmsg(),
+        * unix_dgram_poll(), and unix_wait_for_peer().
+        */
+       smp_mb();
+       if (test_bit(UNIX_NOSPACE, &u->flags)) {
+               clear_bit(UNIX_NOSPACE, &u->flags);
+               wake_up_interruptible_sync_poll(&u->peer_wait,
+                                               POLLOUT | POLLWRNORM |
+                                               POLLWRBAND);
+       }
 
        if (msg->msg_name)
                unix_copy_addr(msg, skb->sk);
@@ -2432,11 +2459,19 @@ static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table
        return mask;
 }
 
+static bool unix_dgram_writable(struct sock *sk, struct sock *other)
+{
+       if (other && unix_peer(other) != sk && unix_recvq_full(other))
+               return false;
+
+       return unix_writable(sk);
+}
+
 static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
                                    poll_table *wait)
 {
        struct sock *sk = sock->sk, *other;
-       unsigned int mask, writable;
+       unsigned int mask;
 
        sock_poll_wait(file, sk_sleep(sk), wait);
        mask = 0;
@@ -2468,20 +2503,23 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
        if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
                return mask;
 
-       writable = unix_writable(sk);
        other = unix_peer_get(sk);
-       if (other) {
-               if (unix_peer(other) != sk) {
-                       if (unix_recvq_full(other))
-                               writable = 0;
-               }
-               sock_put(other);
-       }
-
-       if (writable)
+       if (unix_dgram_writable(sk, other)) {
                mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
-       else
+       } else {
                set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+               if (other)
+                       set_bit(UNIX_NOSPACE, &unix_sk(other)->flags);
+               /* Ensure that we either see space in the peer sk_receive_queue
+                * via the unix_recvq_full() check below, or we receive a wakeup
+                * when it empties. Pairs with the mb in unix_dgram_recvmsg().
+                */
+               smp_mb__after_atomic();
+               if (unix_dgram_writable(sk, other))
+                       mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+       }
+       if (other)
+               sock_put(other);
 
        return mask;
 }
-- 
2.6.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to