Author: mmacy
Date: Thu May 17 17:59:35 2018
New Revision: 333744
URL: https://svnweb.freebsd.org/changeset/base/333744

Log:
  AF_UNIX: make unix socket locking finer grained
  
  This change moves to using a reference count across lock drop / reacquire
  to guarantee liveness.
  
  Currently sends on unix sockets contend heavily on read locking the list lock.
  unix1_processes in will-it-scale peaks at 6 processes and then declines.
  
  With this change I get a substantial improvement in number of operations per 
second
  with 96 processes:
  
  x before
  + after
      N           Min           Max        Median           Avg        Stddev
  x  11       1688420       1696389       1693578     1692766.3     2971.1702
  +  10      63417955      71030114      70662504      69576423     2374684.6
  Difference at 95.0% confidence
          6.78837e+07 +/- 1.49463e+06
          4010.22% +/- 88.4246%
          (Student's t, pooled s = 1.63437e+06)
  
  And even for 2 processes shows a ~18% improvement.
  "Small" iron changes (1, 2, and 4 processes):
  
  x before1
  + after1.2
  +------------------------------------------------------------------------+
  |                                                                  +     |
  |                                                           x      +     |
  |                                                           x      +     |
  |                                                           x      +     |
  |                                                           x     ++     |
  |                                                          xx     ++     |
  |x                                                       x xx     ++     |
  |                                  |__________________A_____M_____AM____||
  +------------------------------------------------------------------------+
      N           Min           Max        Median           Avg        Stddev
  x  10       1131648       1197750     1197138.5     1190369.3     20651.839
  +  10       1203840       1205056       1204919     1204827.9     353.27404
  Difference at 95.0% confidence
          14458.6 +/- 13723
          1.21463% +/- 1.16683%
          (Student's t, pooled s = 14605.2)
  
  x before2
  + after2.2
  +------------------------------------------------------------------------+
  |                                                                       +|
  |                                                                       +|
  |                                                                       +|
  |                                                                       +|
  |                                                                       +|
  |                                                                       +|
  |           x                                                           +|
  |           x                                                           +|
  |         x xx                                                          +|
  |x        xxxx                                                          +|
  |      |___AM_|                                                         A|
  +------------------------------------------------------------------------+
      N           Min           Max        Median           Avg        Stddev
  x  10       1972843       2045866     2038186.5     2030443.8     21367.694
  +  10       2400853       2402196     2401043.5     2401172.7     385.40024
  Difference at 95.0% confidence
          370729 +/- 14198.9
          18.2585% +/- 0.826943%
          (Student's t, pooled s = 15111.7)
  
  x before4
  + after4.2
      N           Min           Max        Median           Avg        Stddev
  x  10       3986994       3991728     3990137.5     3989985.2     1300.0164
  +  10       4799990       4806664     4806116.5       4805194     1990.6625
  Difference at 95.0% confidence
          815209 +/- 1579.64
          20.4314% +/- 0.0421713%
          (Student's t, pooled s = 1681.19)
  
  Tested by: pho
  Reported by:  mjg
  Approved by:  sbruno
  Sponsored by: Limelight Networks
  Differential Revision:        https://reviews.freebsd.org/D15430

Modified:
  head/sys/kern/uipc_usrreq.c
  head/sys/sys/unpcb.h

Modified: head/sys/kern/uipc_usrreq.c
==============================================================================
--- head/sys/kern/uipc_usrreq.c Thu May 17 17:57:41 2018        (r333743)
+++ head/sys/kern/uipc_usrreq.c Thu May 17 17:59:35 2018        (r333744)
@@ -191,13 +191,41 @@ SYSCTL_INT(_net_local, OID_AUTO, deferred, CTLFLAG_RD,
 /*
  * Locking and synchronization:
  *
- * Two types of locks exist in the local domain socket implementation: a
- * a global linkage rwlock and per-unpcb mutexes.  The linkage lock protects
- * the socket count, global generation number, stream/datagram global lists and
- * interconnection of unpcbs, the v_socket and unp_vnode pointers, and can be
- * held exclusively over the acquisition of multiple unpcb locks to prevent
- * deadlock.
+ * Three types of locks exist in the local domain socket implementation: a
+ * a global linkage rwlock, the mtxpool lock, and per-unpcb mutexes.
+ * The linkage lock protects the socket count, global generation number,
+ * and stream/datagram global lists.
  *
+ * The mtxpool lock protects the vnode from being modified while referenced.
+ * Lock ordering requires that it be acquired before any unpcb locks.
+ *
+ * The unpcb lock (unp_mtx) protects all fields in the unpcb. Of particular
+ * note is that this includes the unp_conn field. So long as the unpcb lock
+ * is held the reference to the unpcb pointed to by unp_conn is valid. If we
+ * require that the unpcb pointed to by unp_conn remain live in cases where
+ * we need to drop the unp_mtx as when we need to acquire the lock for a
+ * second unpcb the caller must first acquire an additional reference on the
+ * second unpcb and then revalidate any state (typically check that unp_conn
+ * is non-NULL) upon requiring the initial unpcb lock. The lock ordering
+ * between unpcbs is the conventional ascending address order. Two helper
+ * routines exist for this:
+ *
+ *   - unp_pcb_lock2(unp, unp2) - which just acquires the two locks in the
+ *     safe ordering.
+ *
+ *   - unp_pcb_owned_lock2(unp, unp2, freed) - the lock for unp is held
+ *     when called. If unp is unlocked and unp2 is subsequently freed
+ *     freed will be set to 1.
+ *
+ * The helper routines for references are:
+ *
+ *   - unp_pcb_hold(unp): Can be called any time we currently hold a valid
+ *     reference to unp.
+ *
+ *    - unp_pcb_rele(unp): The caller must hold the unp lock. If we are
+ *      releasing the last reference, detach must have been called thus
+ *      unp->unp_socket be NULL.
+ *
  * UNIX domain sockets each have an unpcb hung off of their so_pcb pointer,
  * allocated in pru_attach() and freed in pru_detach().  The validity of that
  * pointer is an invariant, so no lock is required to dereference the so_pcb
@@ -210,16 +238,9 @@ SYSCTL_INT(_net_local, OID_AUTO, deferred, CTLFLAG_RD,
  * to the unpcb is held.  Typically, this reference will be from the socket,
  * or from another unpcb when the referring unpcb's lock is held (in order
  * that the reference not be invalidated during use).  For example, to follow
- * unp->unp_conn->unp_socket, you need unlock the lock on unp, not unp_conn,
- * as unp_socket remains valid as long as the reference to unp_conn is valid.
+ * unp->unp_conn->unp_socket, you need to hold a lock on unp_conn to guarantee
+ * that detach is not run clearing unp_socket.
  *
- * Fields of unpcbss are locked using a per-unpcb lock, unp_mtx.  Individual
- * atomic reads without the lock may be performed "lockless", but more
- * complex reads and read-modify-writes require the mutex to be held.  No
- * lock order is defined between unpcb locks -- multiple unpcb locks may be
- * acquired at the same time only when holding the linkage rwlock
- * exclusively, which prevents deadlocks.
- *
  * Blocking with UNIX domain sockets is a tricky issue: unlike most network
  * protocols, bind() is a non-atomic operation, and connect() requires
  * potential sleeping in the protocol, due to potentially waiting on local or
@@ -257,13 +278,19 @@ static struct mtx unp_defers_lock;
 #define        UNP_DEFERRED_LOCK()             mtx_lock(&unp_defers_lock)
 #define        UNP_DEFERRED_UNLOCK()           mtx_unlock(&unp_defers_lock)
 
+#define UNP_REF_LIST_LOCK()            UNP_DEFERRED_LOCK();
+#define UNP_REF_LIST_UNLOCK()          UNP_DEFERRED_UNLOCK();
+
 #define UNP_PCB_LOCK_INIT(unp)         mtx_init(&(unp)->unp_mtx,       \
                                            "unp_mtx", "unp_mtx",       \
-                                           MTX_DUPOK|MTX_DEF|MTX_RECURSE)
+                                           MTX_DUPOK|MTX_DEF)
 #define        UNP_PCB_LOCK_DESTROY(unp)       mtx_destroy(&(unp)->unp_mtx)
 #define        UNP_PCB_LOCK(unp)               mtx_lock(&(unp)->unp_mtx)
+#define        UNP_PCB_TRYLOCK(unp)            mtx_trylock(&(unp)->unp_mtx)
 #define        UNP_PCB_UNLOCK(unp)             mtx_unlock(&(unp)->unp_mtx)
+#define        UNP_PCB_OWNED(unp)              mtx_owned(&(unp)->unp_mtx)
 #define        UNP_PCB_LOCK_ASSERT(unp)        mtx_assert(&(unp)->unp_mtx, 
MA_OWNED)
+#define        UNP_PCB_UNLOCK_ASSERT(unp)      mtx_assert(&(unp)->unp_mtx, 
MA_NOTOWNED)
 
 static int     uipc_connect2(struct socket *, struct socket *);
 static int     uipc_ctloutput(struct socket *, struct sockopt *);
@@ -289,6 +316,75 @@ static int unp_externalize_fp(struct file *);
 static struct mbuf     *unp_addsockcred(struct thread *, struct mbuf *);
 static void    unp_process_defers(void * __unused, int);
 
+
+static void
+unp_pcb_hold(struct unpcb *unp)
+{
+       MPASS(unp->unp_refcount);
+       refcount_acquire(&unp->unp_refcount);
+}
+
+static int
+unp_pcb_rele(struct unpcb *unp)
+{
+       int freed;
+
+       UNP_PCB_LOCK_ASSERT(unp);
+       MPASS(unp->unp_refcount);
+       if ((freed = refcount_release(&unp->unp_refcount))) {
+               /* we got here with having detached? */
+               MPASS(unp->unp_socket == NULL);
+               UNP_PCB_UNLOCK(unp);
+               UNP_PCB_LOCK_DESTROY(unp);
+               uma_zfree(unp_zone, unp);
+       }
+       return (freed);
+}
+
+static void
+unp_pcb_lock2(struct unpcb *unp, struct unpcb *unp2)
+{
+       UNP_PCB_UNLOCK_ASSERT(unp);
+       UNP_PCB_UNLOCK_ASSERT(unp2);
+       if ((uintptr_t)unp2 > (uintptr_t)unp) {
+               UNP_PCB_LOCK(unp);
+               UNP_PCB_LOCK(unp2);
+       } else {
+               UNP_PCB_LOCK(unp2);
+               UNP_PCB_LOCK(unp);
+       }
+}
+
+static __noinline void
+unp_pcb_owned_lock2_slowpath(struct unpcb *unp, struct unpcb **unp2p, int 
*freed)
+
+{
+       struct unpcb *unp2;
+
+       unp2 = *unp2p;
+       unp_pcb_hold((unp2));
+       UNP_PCB_UNLOCK((unp));
+       UNP_PCB_LOCK((unp2));
+       UNP_PCB_LOCK((unp));
+       *freed = unp_pcb_rele((unp2));
+       if (*freed)
+               *unp2p = NULL;
+}
+
+#define unp_pcb_owned_lock2(unp, unp2, freed) do {                             
        \
+               freed = 0;                                                      
                                                \
+               UNP_PCB_LOCK_ASSERT((unp));                                     
                                \
+               UNP_PCB_UNLOCK_ASSERT((unp2));                                  
                        \
+               if (__predict_true(UNP_PCB_TRYLOCK((unp2))))                    
        \
+                       break;                                                  
                                                \
+               else if ((uintptr_t)(unp2) > (uintptr_t)(unp))                  
        \
+                       UNP_PCB_LOCK((unp2));                                   
                                \
+               else {                                                          
                                                \
+                       unp_pcb_owned_lock2_slowpath((unp), &(unp2), &freed);   
\
+               }                                                               
                                                        \
+} while (0)
+
+
 /*
  * Definitions of protocols supported in the LOCAL domain.
  */
@@ -344,17 +440,16 @@ uipc_abort(struct socket *so)
 
        unp = sotounpcb(so);
        KASSERT(unp != NULL, ("uipc_abort: unp == NULL"));
+       UNP_PCB_UNLOCK_ASSERT(unp);
 
-       UNP_LINK_WLOCK();
        UNP_PCB_LOCK(unp);
        unp2 = unp->unp_conn;
        if (unp2 != NULL) {
-               UNP_PCB_LOCK(unp2);
+               unp_pcb_hold(unp2);
+               UNP_PCB_UNLOCK(unp);
                unp_drop(unp2);
-               UNP_PCB_UNLOCK(unp2);
-       }
-       UNP_PCB_UNLOCK(unp);
-       UNP_LINK_WUNLOCK();
+       } else
+               UNP_PCB_UNLOCK(unp);
 }
 
 static int
@@ -551,14 +646,12 @@ restart:
        ASSERT_VOP_ELOCKED(vp, "uipc_bind");
        soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK);
 
-       UNP_LINK_WLOCK();
        UNP_PCB_LOCK(unp);
        VOP_UNP_BIND(vp, unp);
        unp->unp_vnode = vp;
        unp->unp_addr = soun;
        unp->unp_flags &= ~UNP_BINDING;
        UNP_PCB_UNLOCK(unp);
-       UNP_LINK_WUNLOCK();
        VOP_UNLOCK(vp, 0);
        vn_finished_write(mp);
        free(buf, M_TEMP);
@@ -585,9 +678,7 @@ uipc_connect(struct socket *so, struct sockaddr *nam, 
        int error;
 
        KASSERT(td == curthread, ("uipc_connect: td != curthread"));
-       UNP_LINK_WLOCK();
        error = unp_connect(so, nam, td);
-       UNP_LINK_WUNLOCK();
        return (error);
 }
 
@@ -598,9 +689,7 @@ uipc_connectat(int fd, struct socket *so, struct socka
        int error;
 
        KASSERT(td == curthread, ("uipc_connectat: td != curthread"));
-       UNP_LINK_WLOCK();
        error = unp_connectat(fd, so, nam, td);
-       UNP_LINK_WUNLOCK();
        return (error);
 }
 
@@ -609,26 +698,41 @@ uipc_close(struct socket *so)
 {
        struct unpcb *unp, *unp2;
        struct vnode *vp = NULL;
-
+       struct mtx *vplock;
+       int freed;
        unp = sotounpcb(so);
        KASSERT(unp != NULL, ("uipc_close: unp == NULL"));
 
-       UNP_LINK_WLOCK();
+
+       vplock = NULL;
+       if ((vp = unp->unp_vnode) != NULL) {
+               vplock = mtx_pool_find(mtxpool_sleep, vp);
+               mtx_lock(vplock);
+       }
        UNP_PCB_LOCK(unp);
-       unp2 = unp->unp_conn;
-       if (unp2 != NULL) {
-               UNP_PCB_LOCK(unp2);
-               unp_disconnect(unp, unp2);
-               UNP_PCB_UNLOCK(unp2);
+       if (vp && unp->unp_vnode == NULL) {
+               mtx_unlock(vplock);
+               vp = NULL;
        }
-       if (SOLISTENING(so) && ((vp = unp->unp_vnode) != NULL)) {
+       if (vp != NULL) {
                VOP_UNP_DETACH(vp);
                unp->unp_vnode = NULL;
        }
-       UNP_PCB_UNLOCK(unp);
-       UNP_LINK_WUNLOCK();
-       if (vp)
+       unp2 = unp->unp_conn;
+       unp_pcb_hold(unp);
+       if (unp2 != NULL) {
+               unp_pcb_hold(unp2);
+               unp_pcb_owned_lock2(unp, unp2, freed);
+               unp_disconnect(unp, unp2);
+               if (unp_pcb_rele(unp2) == 0)
+                       UNP_PCB_UNLOCK(unp2);
+       }
+       if (unp_pcb_rele(unp) == 0)
+               UNP_PCB_UNLOCK(unp);
+       if (vp) {
+               mtx_unlock(vplock);
                vrele(vp);
+       }
 }
 
 static int
@@ -637,17 +741,14 @@ uipc_connect2(struct socket *so1, struct socket *so2)
        struct unpcb *unp, *unp2;
        int error;
 
-       UNP_LINK_WLOCK();
        unp = so1->so_pcb;
        KASSERT(unp != NULL, ("uipc_connect2: unp == NULL"));
-       UNP_PCB_LOCK(unp);
        unp2 = so2->so_pcb;
        KASSERT(unp2 != NULL, ("uipc_connect2: unp2 == NULL"));
-       UNP_PCB_LOCK(unp2);
+       unp_pcb_lock2(unp, unp2);
        error = unp_connect2(so1, so2, PRU_CONNECT2);
        UNP_PCB_UNLOCK(unp2);
        UNP_PCB_UNLOCK(unp);
-       UNP_LINK_WUNLOCK();
        return (error);
 }
 
@@ -655,6 +756,7 @@ static void
 uipc_detach(struct socket *so)
 {
        struct unpcb *unp, *unp2;
+       struct mtx *vplock;
        struct sockaddr_un *saved_unp_addr;
        struct vnode *vp;
        int freeunp, local_unp_rights;
@@ -669,49 +771,77 @@ uipc_detach(struct socket *so)
        LIST_REMOVE(unp, unp_link);
        unp->unp_gencnt = ++unp_gencnt;
        --unp_count;
+       UNP_LINK_WUNLOCK();
+
+       UNP_PCB_UNLOCK_ASSERT(unp);
+ restart:
+       if ((vp = unp->unp_vnode) != NULL) {
+               vplock = mtx_pool_find(mtxpool_sleep, vp);
+               mtx_lock(vplock);
+       }
        UNP_PCB_LOCK(unp);
-       if ((unp->unp_flags & UNP_NASCENT) != 0)
+       if ((unp2 = unp->unp_conn) != NULL) {
+               unp_pcb_owned_lock2(unp, unp2, freeunp);
+               if (freeunp)
+                       unp2 = NULL;
+       }
+       if (unp->unp_vnode != vp &&
+               unp->unp_vnode != NULL) {
+               mtx_unlock(vplock);
+               UNP_PCB_UNLOCK(unp);
+               if (unp2)
+                       UNP_PCB_UNLOCK(unp2);
+               goto restart;
+       }
+       if ((unp->unp_flags & UNP_NASCENT) != 0) {
+               if (unp2)
+                       UNP_PCB_UNLOCK(unp2);
                goto teardown;
-
+       }
        if ((vp = unp->unp_vnode) != NULL) {
                VOP_UNP_DETACH(vp);
                unp->unp_vnode = NULL;
        }
-       unp2 = unp->unp_conn;
+       unp_pcb_hold(unp);
        if (unp2 != NULL) {
-               UNP_PCB_LOCK(unp2);
+               unp_pcb_hold(unp2);
                unp_disconnect(unp, unp2);
-               UNP_PCB_UNLOCK(unp2);
+               if (unp_pcb_rele(unp2) == 0)
+                       UNP_PCB_UNLOCK(unp2);
        }
-
-       /*
-        * We hold the linkage lock exclusively, so it's OK to acquire
-        * multiple pcb locks at a time.
-        */
+       UNP_PCB_UNLOCK(unp);
+       UNP_REF_LIST_LOCK();
        while (!LIST_EMPTY(&unp->unp_refs)) {
                struct unpcb *ref = LIST_FIRST(&unp->unp_refs);
 
-               UNP_PCB_LOCK(ref);
+               unp_pcb_hold(ref);
+               UNP_REF_LIST_UNLOCK();
+
+               MPASS(ref != unp);
+               UNP_PCB_UNLOCK_ASSERT(ref);
                unp_drop(ref);
-               UNP_PCB_UNLOCK(ref);
+               UNP_REF_LIST_LOCK();
        }
+
+       UNP_REF_LIST_UNLOCK();
+       UNP_PCB_LOCK(unp);
+       freeunp = unp_pcb_rele(unp);
+       MPASS(freeunp == 0);
        local_unp_rights = unp_rights;
 teardown:
-       UNP_LINK_WUNLOCK();
        unp->unp_socket->so_pcb = NULL;
        saved_unp_addr = unp->unp_addr;
        unp->unp_addr = NULL;
-       unp->unp_refcount--;
-       freeunp = (unp->unp_refcount == 0);
+       unp->unp_socket = NULL;
+       freeunp = unp_pcb_rele(unp);
        if (saved_unp_addr != NULL)
                free(saved_unp_addr, M_SONAME);
-       if (freeunp) {
-               UNP_PCB_LOCK_DESTROY(unp);
-               uma_zfree(unp_zone, unp);
-       } else
+       if (!freeunp)
                UNP_PCB_UNLOCK(unp);
-       if (vp)
+       if (vp) {
+               mtx_unlock(vplock);
                vrele(vp);
+       }
        if (local_unp_rights)
                taskqueue_enqueue_timeout(taskqueue_thread, &unp_gc_task, -1);
 }
@@ -720,20 +850,28 @@ static int
 uipc_disconnect(struct socket *so)
 {
        struct unpcb *unp, *unp2;
+       int freed;
 
        unp = sotounpcb(so);
        KASSERT(unp != NULL, ("uipc_disconnect: unp == NULL"));
 
-       UNP_LINK_WLOCK();
        UNP_PCB_LOCK(unp);
-       unp2 = unp->unp_conn;
-       if (unp2 != NULL) {
-               UNP_PCB_LOCK(unp2);
-               unp_disconnect(unp, unp2);
-               UNP_PCB_UNLOCK(unp2);
+       if ((unp2 = unp->unp_conn) == NULL) {
+               UNP_PCB_UNLOCK(unp);
+               return (0);
        }
-       UNP_PCB_UNLOCK(unp);
-       UNP_LINK_WUNLOCK();
+       unp_pcb_owned_lock2(unp, unp2, freed);
+       if (__predict_false(freed)) {
+               UNP_PCB_UNLOCK(unp);
+               return (0);
+       }
+       unp_pcb_hold(unp2);
+       unp_pcb_hold(unp);
+       unp_disconnect(unp, unp2);
+       if (unp_pcb_rele(unp) == 0)
+               UNP_PCB_UNLOCK(unp);
+       if (unp_pcb_rele(unp2) == 0)
+               UNP_PCB_UNLOCK(unp2);
        return (0);
 }
 
@@ -852,13 +990,35 @@ uipc_rcvd(struct socket *so, int flags)
 }
 
 static int
+connect_internal(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+       int error;
+       struct unpcb *unp;
+
+       unp = so->so_pcb;
+       if (unp->unp_conn != NULL)
+               return (EISCONN);
+       error = unp_connect(so, nam, td);
+       if (error)
+               return (error);
+       UNP_PCB_LOCK(unp);
+       if (unp->unp_conn == NULL) {
+               UNP_PCB_UNLOCK(unp);
+               if (error == 0)
+                       error = ENOTCONN;
+       }
+       return (error);
+}
+
+
+static int
 uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
     struct mbuf *control, struct thread *td)
 {
        struct unpcb *unp, *unp2;
        struct socket *so2;
        u_int mbcnt, sbcc;
-       int error = 0;
+       int freed, error;
 
        unp = sotounpcb(so);
        KASSERT(unp != NULL, ("%s: unp == NULL", __func__));
@@ -866,49 +1026,66 @@ uipc_send(struct socket *so, int flags, struct mbuf *m
            so->so_type == SOCK_SEQPACKET,
            ("%s: socktype %d", __func__, so->so_type));
 
+       freed = error = 0;
        if (flags & PRUS_OOB) {
                error = EOPNOTSUPP;
                goto release;
        }
        if (control != NULL && (error = unp_internalize(&control, td)))
                goto release;
-       if ((nam != NULL) || (flags & PRUS_EOF))
-               UNP_LINK_WLOCK();
-       else
-               UNP_LINK_RLOCK();
+
+       unp2 = NULL;
        switch (so->so_type) {
        case SOCK_DGRAM:
        {
                const struct sockaddr *from;
 
-               unp2 = unp->unp_conn;
                if (nam != NULL) {
-                       UNP_LINK_WLOCK_ASSERT();
-                       if (unp2 != NULL) {
-                               error = EISCONN;
+                       /*
+                        * We return with UNP_PCB_LOCK_HELD so we know that
+                        * the reference is live if the pointer is valid.
+                        */
+                       if ((error = connect_internal(so, nam, td)))
                                break;
-                       }
-                       error = unp_connect(so, nam, td);
-                       if (error)
-                               break;
+                       MPASS(unp->unp_conn != NULL);
                        unp2 = unp->unp_conn;
-               }
+               } else  {
+                       UNP_PCB_LOCK(unp);
 
+                       /*
+                        * Because connect() and send() are non-atomic in a 
sendto()
+                        * with a target address, it's possible that the socket 
will
+                        * have disconnected before the send() can run.  In 
that case
+                        * return the slightly counter-intuitive but otherwise
+                        * correct error that the socket is not connected.
+                        */
+                       if ((unp2 = unp->unp_conn)  == NULL) {
+                               UNP_PCB_UNLOCK(unp);
+                               error = ENOTCONN;
+                               break;
+                       }
+               }
+               unp_pcb_owned_lock2(unp, unp2, freed);
+               if (__predict_false(freed)) {
+                       UNP_PCB_UNLOCK(unp);
+                       error = ENOTCONN;
+                       break;
+               }
                /*
-                * Because connect() and send() are non-atomic in a sendto()
-                * with a target address, it's possible that the socket will
-                * have disconnected before the send() can run.  In that case
-                * return the slightly counter-intuitive but otherwise
-                * correct error that the socket is not connected.
+                * The socket referencing unp2 may have been closed
+                * or unp may have been disconnected if the unp lock
+                * was dropped to acquire unp2.
                 */
-               if (unp2 == NULL) {
+               if (__predict_false(unp->unp_conn == NULL) ||
+                       unp2->unp_socket == NULL) {
+                       UNP_PCB_UNLOCK(unp);
+                       if (unp_pcb_rele(unp2) == 0)
+                               UNP_PCB_UNLOCK(unp2);
                        error = ENOTCONN;
                        break;
                }
-               /* Lockless read. */
                if (unp2->unp_flags & UNP_WANTCRED)
                        control = unp_addsockcred(td, control);
-               UNP_PCB_LOCK(unp);
                if (unp->unp_addr != NULL)
                        from = (struct sockaddr *)unp->unp_addr;
                else
@@ -924,12 +1101,9 @@ uipc_send(struct socket *so, int flags, struct mbuf *m
                        SOCKBUF_UNLOCK(&so2->so_rcv);
                        error = ENOBUFS;
                }
-               if (nam != NULL) {
-                       UNP_LINK_WLOCK_ASSERT();
-                       UNP_PCB_LOCK(unp2);
+               if (nam != NULL)
                        unp_disconnect(unp, unp2);
-                       UNP_PCB_UNLOCK(unp2);
-               }
+               UNP_PCB_UNLOCK(unp2);
                UNP_PCB_UNLOCK(unp);
                break;
        }
@@ -938,42 +1112,37 @@ uipc_send(struct socket *so, int flags, struct mbuf *m
        case SOCK_STREAM:
                if ((so->so_state & SS_ISCONNECTED) == 0) {
                        if (nam != NULL) {
-                               UNP_LINK_WLOCK_ASSERT();
-                               error = unp_connect(so, nam, td);
-                               if (error)
-                                       break;  /* XXX */
-                       } else {
+                               if ((error = connect_internal(so, nam, td)))
+                                       break;
+                       } else  {
                                error = ENOTCONN;
                                break;
                        }
-               }
-
-               /* Lockless read. */
-               if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
+               } else if ((unp2 = unp->unp_conn) == NULL) {
+                       error = ENOTCONN;
+                       break;
+               } else if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
                        error = EPIPE;
                        break;
+               } else {
+                       UNP_PCB_LOCK(unp);
+                       if ((unp2 = unp->unp_conn) == NULL) {
+                               UNP_PCB_UNLOCK(unp);
+                               error = ENOTCONN;
+                               break;
+                       }
                }
-
-               /*
-                * Because connect() and send() are non-atomic in a sendto()
-                * with a target address, it's possible that the socket will
-                * have disconnected before the send() can run.  In that case
-                * return the slightly counter-intuitive but otherwise
-                * correct error that the socket is not connected.
-                *
-                * Locking here must be done carefully: the linkage lock
-                * prevents interconnections between unpcbs from changing, so
-                * we can traverse from unp to unp2 without acquiring unp's
-                * lock.  Socket buffer locks follow unpcb locks, so we can
-                * acquire both remote and lock socket buffer locks.
-                */
-               unp2 = unp->unp_conn;
-               if (unp2 == NULL) {
+               unp_pcb_owned_lock2(unp, unp2, freed);
+               UNP_PCB_UNLOCK(unp);
+               if (__predict_false(freed)) {
                        error = ENOTCONN;
                        break;
                }
-               so2 = unp2->unp_socket;
-               UNP_PCB_LOCK(unp2);
+               if ((so2 = unp2->unp_socket) == NULL) {
+                       UNP_PCB_UNLOCK(unp2);
+                       error = ENOTCONN;
+                       break;
+               }
                SOCKBUF_LOCK(&so2->so_rcv);
                if (unp2->unp_flags & UNP_WANTCRED) {
                        /*
@@ -1046,12 +1215,6 @@ uipc_send(struct socket *so, int flags, struct mbuf *m
                unp_shutdown(unp);
                UNP_PCB_UNLOCK(unp);
        }
-
-       if ((nam != NULL) || (flags & PRUS_EOF))
-               UNP_LINK_WUNLOCK();
-       else
-               UNP_LINK_RUNLOCK();
-
        if (control != NULL && error != 0)
                unp_dispose_mbuf(control);
 
@@ -1124,12 +1287,10 @@ uipc_shutdown(struct socket *so)
        unp = sotounpcb(so);
        KASSERT(unp != NULL, ("uipc_shutdown: unp == NULL"));
 
-       UNP_LINK_WLOCK();
        UNP_PCB_LOCK(unp);
        socantsendmore(so);
        unp_shutdown(unp);
        UNP_PCB_UNLOCK(unp);
-       UNP_LINK_WUNLOCK();
        return (0);
 }
 
@@ -1333,16 +1494,11 @@ unp_connectat(int fd, struct socket *so, struct sockad
        char buf[SOCK_MAXADDRLEN];
        struct sockaddr *sa;
        cap_rights_t rights;
-       int error, len;
+       int error, len, freed;
+       struct mtx *vplock;
 
        if (nam->sa_family != AF_UNIX)
                return (EAFNOSUPPORT);
-
-       UNP_LINK_WLOCK_ASSERT();
-
-       unp = sotounpcb(so);
-       KASSERT(unp != NULL, ("unp_connect: unp == NULL"));
-
        if (nam->sa_len > sizeof(struct sockaddr_un))
                return (EINVAL);
        len = nam->sa_len - offsetof(struct sockaddr_un, sun_path);
@@ -1351,12 +1507,12 @@ unp_connectat(int fd, struct socket *so, struct sockad
        bcopy(soun->sun_path, buf, len);
        buf[len] = 0;
 
+       unp = sotounpcb(so);
        UNP_PCB_LOCK(unp);
        if (unp->unp_flags & UNP_CONNECTING) {
                UNP_PCB_UNLOCK(unp);
                return (EALREADY);
        }
-       UNP_LINK_WUNLOCK();
        unp->unp_flags |= UNP_CONNECTING;
        UNP_PCB_UNLOCK(unp);
 
@@ -1389,11 +1545,8 @@ unp_connectat(int fd, struct socket *so, struct sockad
        unp = sotounpcb(so);
        KASSERT(unp != NULL, ("unp_connect: unp == NULL"));
 
-       /*
-        * Lock linkage lock for two reasons: make sure v_socket is stable,
-        * and to protect simultaneous locking of multiple pcbs.
-        */
-       UNP_LINK_WLOCK();
+       vplock = mtx_pool_find(mtxpool_sleep, vp);
+       mtx_lock(vplock);
        VOP_UNP_CONNECT(vp, &unp2);
        if (unp2 == NULL) {
                error = ECONNREFUSED;
@@ -1404,8 +1557,7 @@ unp_connectat(int fd, struct socket *so, struct sockad
                error = EPROTOTYPE;
                goto bad2;
        }
-       UNP_PCB_LOCK(unp);
-       UNP_PCB_LOCK(unp2);
+       unp_pcb_lock2(unp, unp2);
        if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
                if (so2->so_options & SO_ACCEPTCONN) {
                        CURVNET_SET(so2->so_vnet);
@@ -1418,7 +1570,9 @@ unp_connectat(int fd, struct socket *so, struct sockad
                        goto bad3;
                }
                unp3 = sotounpcb(so2);
-               UNP_PCB_LOCK(unp3);
+               UNP_PCB_UNLOCK(unp);
+               unp_pcb_owned_lock2(unp2, unp3, freed);
+               MPASS(!freed);
                if (unp2->unp_addr != NULL) {
                        bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len);
                        unp3->unp_addr = (struct sockaddr_un *) sa;
@@ -1445,6 +1599,8 @@ unp_connectat(int fd, struct socket *so, struct sockad
                        unp3->unp_flags |= UNP_WANTCRED;
                UNP_PCB_UNLOCK(unp2);
                unp2 = unp3;
+               unp_pcb_owned_lock2(unp2, unp, freed);
+               MPASS(!freed);
 #ifdef MAC
                mac_socketpeer_set_from_socket(so, so2);
                mac_socketpeer_set_from_socket(so2, so);
@@ -1459,12 +1615,12 @@ bad3:
        UNP_PCB_UNLOCK(unp2);
        UNP_PCB_UNLOCK(unp);
 bad2:
-       UNP_LINK_WUNLOCK();
+       mtx_unlock(vplock);
 bad:
-       if (vp != NULL)
+       if (vp != NULL) {
                vput(vp);
+       }
        free(sa, M_SONAME);
-       UNP_LINK_WLOCK();
        UNP_PCB_LOCK(unp);
        unp->unp_flags &= ~UNP_CONNECTING;
        UNP_PCB_UNLOCK(unp);
@@ -1482,7 +1638,6 @@ unp_connect2(struct socket *so, struct socket *so2, in
        unp2 = sotounpcb(so2);
        KASSERT(unp2 != NULL, ("unp_connect2: unp2 == NULL"));
 
-       UNP_LINK_WLOCK_ASSERT();
        UNP_PCB_LOCK_ASSERT(unp);
        UNP_PCB_LOCK_ASSERT(unp2);
 
@@ -1490,10 +1645,13 @@ unp_connect2(struct socket *so, struct socket *so2, in
                return (EPROTOTYPE);
        unp2->unp_flags &= ~UNP_NASCENT;
        unp->unp_conn = unp2;
-
+       unp_pcb_hold(unp2);
+       unp_pcb_hold(unp);
        switch (so->so_type) {
        case SOCK_DGRAM:
+               UNP_REF_LIST_LOCK();
                LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink);
+               UNP_REF_LIST_UNLOCK();
                soisconnected(so);
                break;
 
@@ -1517,31 +1675,48 @@ unp_connect2(struct socket *so, struct socket *so2, in
 static void
 unp_disconnect(struct unpcb *unp, struct unpcb *unp2)
 {
-       struct socket *so;
+       struct socket *so, *so2;
+       int rele, freed;
 
        KASSERT(unp2 != NULL, ("unp_disconnect: unp2 == NULL"));
 
-       UNP_LINK_WLOCK_ASSERT();
        UNP_PCB_LOCK_ASSERT(unp);
        UNP_PCB_LOCK_ASSERT(unp2);
 
+       if (unp->unp_conn == NULL && unp2->unp_conn == NULL)
+               return;
+
+       MPASS(unp->unp_conn == unp2);
        unp->unp_conn = NULL;
+       rele = 0;
+       so = unp->unp_socket;
+       so2 = unp2->unp_socket;
        switch (unp->unp_socket->so_type) {
        case SOCK_DGRAM:
+               UNP_REF_LIST_LOCK();
                LIST_REMOVE(unp, unp_reflink);
-               so = unp->unp_socket;
-               SOCK_LOCK(so);
-               so->so_state &= ~SS_ISCONNECTED;
-               SOCK_UNLOCK(so);
+               UNP_REF_LIST_UNLOCK();
+               if (so) {
+                       SOCK_LOCK(so);
+                       so->so_state &= ~SS_ISCONNECTED;
+                       SOCK_UNLOCK(so);
+               }
                break;
 
        case SOCK_STREAM:
        case SOCK_SEQPACKET:
-               soisdisconnected(unp->unp_socket);
+               if (so)
+                       soisdisconnected(so);
+               MPASS(unp2->unp_conn == unp);
                unp2->unp_conn = NULL;
-               soisdisconnected(unp2->unp_socket);
+               if (so2)
+                       soisdisconnected(so2);
                break;
        }
+       freed = unp_pcb_rele(unp);
+       MPASS(freed == 0);
+       freed = unp_pcb_rele(unp2);
+       MPASS(freed == 0);
 }
 
 /*
@@ -1625,7 +1800,7 @@ unp_pcblist(SYSCTL_HANDLER_ARGS)
                                continue;
                        }
                        unp_list[i++] = unp;
-                       unp->unp_refcount++;
+                       unp_pcb_hold(unp);
                }
                UNP_PCB_UNLOCK(unp);
        }
@@ -1637,8 +1812,9 @@ unp_pcblist(SYSCTL_HANDLER_ARGS)
        for (i = 0; i < n; i++) {
                unp = unp_list[i];
                UNP_PCB_LOCK(unp);
-               unp->unp_refcount--;
-               if (unp->unp_refcount != 0 && unp->unp_gencnt <= gencnt) {
+               freeunp = unp_pcb_rele(unp);
+
+               if (freeunp == 0 && unp->unp_gencnt <= gencnt) {
                        xu->xu_len = sizeof *xu;
                        xu->xu_unpp = unp;
                        /*
@@ -1665,14 +1841,8 @@ unp_pcblist(SYSCTL_HANDLER_ARGS)
                        sotoxsocket(unp->unp_socket, &xu->xu_socket);
                        UNP_PCB_UNLOCK(unp);
                        error = SYSCTL_OUT(req, xu, sizeof *xu);
-               } else {
-                       freeunp = (unp->unp_refcount == 0);
+               } else  if (freeunp == 0)
                        UNP_PCB_UNLOCK(unp);
-                       if (freeunp) {
-                               UNP_PCB_LOCK_DESTROY(unp);
-                               uma_zfree(unp_zone, unp);
-                       }
-               }
        }
        free(xu, M_TEMP);
        if (!error) {
@@ -1709,7 +1879,6 @@ unp_shutdown(struct unpcb *unp)
        struct unpcb *unp2;
        struct socket *so;
 
-       UNP_LINK_WLOCK_ASSERT();
        UNP_PCB_LOCK_ASSERT(unp);
 
        unp2 = unp->unp_conn;
@@ -1726,22 +1895,28 @@ unp_drop(struct unpcb *unp)
 {
        struct socket *so = unp->unp_socket;
        struct unpcb *unp2;
+       int freed;
 
-       UNP_LINK_WLOCK_ASSERT();
-       UNP_PCB_LOCK_ASSERT(unp);
-
        /*
         * Regardless of whether the socket's peer dropped the connection
         * with this socket by aborting or disconnecting, POSIX requires
         * that ECONNRESET is returned.
         */
-       so->so_error = ECONNRESET;
+       /* acquire a reference so that unp isn't freed from underneath us */
+
+       UNP_PCB_LOCK(unp);
+       if (so)
+               so->so_error = ECONNRESET;
        unp2 = unp->unp_conn;
-       if (unp2 == NULL)
-               return;
-       UNP_PCB_LOCK(unp2);
-       unp_disconnect(unp, unp2);
-       UNP_PCB_UNLOCK(unp2);
+       if (unp2 != NULL) {
+               unp_pcb_hold(unp2);
+               unp_pcb_owned_lock2(unp, unp2, freed);
+               unp_disconnect(unp, unp2);
+               if (unp_pcb_rele(unp2) == 0)
+                       UNP_PCB_UNLOCK(unp2);
+       }
+       if (unp_pcb_rele(unp) == 0)
+               UNP_PCB_UNLOCK(unp);
 }
 
 static void
@@ -1881,7 +2056,7 @@ unp_init(void)
                return;
 #endif
        unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, NULL,
-           NULL, NULL, UMA_ALIGN_PTR, 0);
+           NULL, NULL, UMA_ALIGN_CACHE, 0);
        if (unp_zone == NULL)
                panic("unp_init");
        uma_zone_set_max(unp_zone, maxsockets);
@@ -2464,13 +2639,15 @@ vfs_unp_reclaim(struct vnode *vp)
 {
        struct unpcb *unp;
        int active;
+       struct mtx *vplock;
 
        ASSERT_VOP_ELOCKED(vp, "vfs_unp_reclaim");
        KASSERT(vp->v_type == VSOCK,
            ("vfs_unp_reclaim: vp->v_type != VSOCK"));
 
        active = 0;
-       UNP_LINK_WLOCK();
+       vplock = mtx_pool_find(mtxpool_sleep, vp);
+       mtx_lock(vplock);
        VOP_UNP_CONNECT(vp, &unp);
        if (unp == NULL)
                goto done;
@@ -2481,8 +2658,8 @@ vfs_unp_reclaim(struct vnode *vp)
                active = 1;
        }
        UNP_PCB_UNLOCK(unp);
-done:
-       UNP_LINK_WUNLOCK();
+ done:
+       mtx_unlock(vplock);
        if (active)
                vunref(vp);
 }

Modified: head/sys/sys/unpcb.h
==============================================================================
--- head/sys/sys/unpcb.h        Thu May 17 17:57:41 2018        (r333743)
+++ head/sys/sys/unpcb.h        Thu May 17 17:59:35 2018        (r333744)
@@ -69,23 +69,25 @@ typedef uint64_t unp_gen_t;
 LIST_HEAD(unp_head, unpcb);
 
 struct unpcb {
-       LIST_ENTRY(unpcb) unp_link;     /* glue on list of all PCBs */
-       struct  socket *unp_socket;     /* pointer back to socket */
-       struct  file *unp_file;         /* back-pointer to file for gc. */
-       struct  vnode *unp_vnode;       /* if associated with file */
-       ino_t   unp_ino;                /* fake inode number */
+       /* Cache line 1 */
+       struct  mtx unp_mtx;            /* mutex */
        struct  unpcb *unp_conn;        /* control block of connected socket */
-       struct  unp_head unp_refs;      /* referencing socket linked list */
-       LIST_ENTRY(unpcb) unp_reflink;  /* link in unp_refs list */
-       struct  sockaddr_un *unp_addr;  /* bound address of socket */
-       unp_gen_t unp_gencnt;           /* generation count of this instance */
+       volatile u_int  unp_refcount;
        short   unp_flags;              /* flags */
        short   unp_gcflag;             /* Garbage collector flags. */
+       struct  sockaddr_un *unp_addr;  /* bound address of socket */
+       struct  socket *unp_socket;     /* pointer back to socket */
+       /* Cache line 2 */
+       struct  vnode *unp_vnode;       /* if associated with file */
        struct  xucred unp_peercred;    /* peer credentials, if applicable */
-       u_int   unp_refcount;
+       LIST_ENTRY(unpcb) unp_reflink;  /* link in unp_refs list */
+       LIST_ENTRY(unpcb) unp_link;     /* glue on list of all PCBs */
+       struct  unp_head unp_refs;      /* referencing socket linked list */
+       unp_gen_t unp_gencnt;           /* generation count of this instance */
+       struct  file *unp_file;         /* back-pointer to file for gc. */
        u_int   unp_msgcount;           /* references from message queue */
-       struct  mtx unp_mtx;            /* mutex */
-};
+       ino_t   unp_ino;                /* fake inode number */
+} __aligned(CACHE_LINE_SIZE);

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
_______________________________________________
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"

Reply via email to