The UNIX sockets garbage collector was moved out of global `unp_lock'
rwlock(9) which locks the whole layer. Now it's used to protect
per-socket data and it's time to replace it to per-socket's `so_lock'.

Unlike PF_ROUTE and PF_KEY sockets we have the paths where multiple
sockets should be locked simultaneously. This provides some problems and
re-lock dances to solve them.

Except the case when the only thread locks multiple sockets the lock
order matters. To avoid the deadlock we always lock first the socket
with the smallest pointer address.

We have cases where one socket `so1' is already locked and the second
socket `so2' we want to lock has the smallest address. We need to unlock
`so1', then lock `so2' and then lock `so1'. In such cases the lock of
`so1' also protects `so2' from being killed by concurrent thread. To
prevent this we get extra reference count on `so2' before re-lock `so1'
and release it after. Also `so1' could change it's state while being
unlocked so we need to check it. This introduces re-lock dances not only
in PCB layer but in sockets layer too.

In the sockets layer PF_INET and PF_INET6 sockets use the same code path
with PF_UNIX sockets. The solock_persocket() function introduced to
determine which locking strategy should be used.

Please test the diff and give me feedback. I'm also interesting in the
positive experience so feel free to report.

Index: sys/kern/uipc_socket.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.269
diff -u -p -r1.269 uipc_socket.c
--- sys/kern/uipc_socket.c      11 Nov 2021 16:35:09 -0000      1.269
+++ sys/kern/uipc_socket.c      18 Nov 2021 10:49:43 -0000
@@ -52,6 +52,7 @@
 #include <sys/atomic.h>
 #include <sys/rwlock.h>
 #include <sys/time.h>
+#include <sys/refcnt.h>
 
 #ifdef DDB
 #include <machine/db_machdep.h>
@@ -156,7 +157,9 @@ soalloc(int prflags)
        so = pool_get(&socket_pool, prflags);
        if (so == NULL)
                return (NULL);
-       rw_init(&so->so_lock, "solock");
+       rw_init_flags(&so->so_lock, "solock", RWL_DUPOK);
+       refcnt_init(&so->so_refcnt);
+
        return (so);
 }
 
@@ -257,6 +260,8 @@ solisten(struct socket *so, int backlog)
 void
 sofree(struct socket *so, int s)
 {
+       int persocket = solock_persocket(so);
+
        soassertlocked(so);
 
        if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
@@ -264,16 +269,57 @@ sofree(struct socket *so, int s)
                return;
        }
        if (so->so_head) {
+               struct socket *head = so->so_head;
+
                /*
                 * We must not decommission a socket that's on the accept(2)
                 * queue.  If we do, then accept(2) may hang after select(2)
                 * indicated that the listening socket was ready.
                 */
-               if (!soqremque(so, 0)) {
+               if (so->so_onq == &head->so_q) {
                        sounlock(so, s);
                        return;
                }
+
+               if (persocket) {
+                       if (so < head)
+                               solock(head);
+                       else {
+                               /*
+                                * Concurrent close of `head' could
+                                * abort `so' due to re-lock.
+                                */
+                               soref(so);
+                               soref(head);
+                               sounlock(so, SL_LOCKED);
+                               solock(head);
+                               solock(so);
+
+                               if (so->so_onq != &head->so_q0) {
+                                       sounlock(head, SL_LOCKED);
+                                       sounlock(so, SL_LOCKED);
+                                       sorele(head);
+                                       sorele(so);
+                                       return;
+                               }
+
+                               sorele(head);
+                               sorele(so);
+                       }
+               }
+
+               soqremque(so, 0);
+
+               if (persocket)
+                       sounlock(head, SL_LOCKED);
        }
+
+       if (persocket) {
+               sounlock(so, SL_LOCKED);
+               refcnt_finalize(&so->so_refcnt, "sofinal");
+               solock(so);
+       }
+
        sigio_free(&so->so_sigio);
        klist_free(&so->so_rcv.sb_sel.si_note);
        klist_free(&so->so_snd.sb_sel.si_note);
@@ -363,13 +409,68 @@ drop:
                        error = error2;
        }
        if (so->so_options & SO_ACCEPTCONN) {
+               int persocket = solock_persocket(so);
+
+               if (persocket) {
+                       /* Wait concurrent sonewconn() threads. */
+                       while (so->so_newconn > 0) {
+                               so->so_state |= SS_NEWCONN_WAIT;
+                               sosleep_nsec(so, &so->so_newconn, PSOCK,
+                                       "netlck", INFSLP);
+                       }
+               }
+
                while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
+                       if (persocket) {
+                               if (so < so2)
+                                       solock(so2);
+                               else {
+                                       soref(so2);
+                                       sounlock(so, SL_LOCKED);
+                                       solock(so2);
+                                       solock(so);
+
+                                       if (so2->so_onq != &so->so_q0) {
+                                               sounlock(so2, SL_LOCKED);
+                                               sorele(so2);
+                                               continue;
+                                       }
+
+                                       sorele(so2);
+                               }
+                       }
                        (void) soqremque(so2, 0);
+                       if (persocket)
+                               sounlock(so, SL_LOCKED);
                        (void) soabort(so2);
+                       if (persocket)
+                               solock(so);
                }
                while ((so2 = TAILQ_FIRST(&so->so_q)) != NULL) {
+                       if (persocket) {
+                               if (so < so2)
+                                       solock(so2);
+                               else {
+                                       soref(so2);
+                                       sounlock(so, SL_LOCKED);
+                                       solock(so2);
+                                       solock(so);
+
+                                       if (so2->so_onq != &so->so_q) {
+                                               sounlock(so2, SL_LOCKED);
+                                               sorele(so2);
+                                               continue;
+                                       }
+
+                                       sorele(so2);
+                               }
+                       }
                        (void) soqremque(so2, 1);
+                       if (persocket)
+                               sounlock(so, SL_LOCKED);
                        (void) soabort(so2);
+                       if (persocket)
+                               solock(so);
                }
        }
 discard:
@@ -437,11 +538,19 @@ soconnect(struct socket *so, struct mbuf
 int
 soconnect2(struct socket *so1, struct socket *so2)
 {
-       int s, error;
+       int persocket, s, error;
+
+       if ((persocket = solock_persocket(so1))) {
+               solock_pair(so1, so2);
+               s = SL_LOCKED;
+       } else
+               s = solock(so1);
 
-       s = solock(so1);
        error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2, NULL,
            (struct mbuf *)so2, NULL, curproc);
+
+       if (persocket)
+               sounlock(so2, s);
        sounlock(so1, s);
        return (error);
 }
Index: sys/kern/uipc_socket2.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket2.c,v
retrieving revision 1.116
diff -u -p -r1.116 uipc_socket2.c
--- sys/kern/uipc_socket2.c     6 Nov 2021 05:26:33 -0000       1.116
+++ sys/kern/uipc_socket2.c     18 Nov 2021 10:49:43 -0000
@@ -53,8 +53,6 @@ u_long        sb_max = SB_MAX;                /* patchable */
 extern struct pool mclpools[];
 extern struct pool mbpool;
 
-extern struct rwlock unp_lock;
-
 /*
  * Procedures to manipulate state flags of socket
  * and do appropriate wakeups.  Normal sequence from the
@@ -101,10 +99,42 @@ soisconnected(struct socket *so)
        soassertlocked(so);
        so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING);
        so->so_state |= SS_ISCONNECTED;
-       if (head && soqremque(so, 0)) {
+
+       if (head != NULL && so->so_onq == &head->so_q0) {
+               int persocket = solock_persocket(so);
+
+               if (persocket) {
+                       if (so < head)
+                               solock(head);
+                       else {
+                               soref(so);
+                               soref(head);
+
+                               sounlock(so, SL_LOCKED);
+                               solock(head);
+                               solock(so);
+
+                               if (so->so_onq != &head->so_q0) {
+                                       sounlock(head, SL_LOCKED);
+                                       sounlock(so, SL_LOCKED);
+                                       sorele(head);
+                                       sorele(so);
+
+                                       return;
+                               }
+
+                               sorele(head);
+                               sorele(so);
+                       }
+               }
+               
+               soqremque(so, 0);
                soqinsque(head, so, 1);
                sorwakeup(head);
                wakeup_one(&head->so_timeo);
+
+               if (persocket)
+                       sounlock(head, SL_LOCKED);
        } else {
                wakeup(&so->so_timeo);
                sorwakeup(so);
@@ -146,7 +176,8 @@ struct socket *
 sonewconn(struct socket *head, int connstatus)
 {
        struct socket *so;
-       int soqueue = connstatus ? 1 : 0;
+       int persocket = solock_persocket(head);
+       int error;
 
        /*
         * XXXSMP as long as `so' and `head' share the same lock, we
@@ -174,10 +205,20 @@ sonewconn(struct socket *head, int conns
        so->so_rgid = head->so_rgid;
        so->so_cpid = head->so_cpid;
 
+       if (persocket) {
+               /*
+                * Lock order doesn't matter. We are the only thread
+                * which simultaneously locks these sockets.
+                */
+               solock(so);
+       }
+
        /*
         * Inherit watermarks but those may get clamped in low mem situations.
         */
        if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
+               if (persocket)
+                       sounlock(so, SL_LOCKED);
                pool_put(&socket_pool, so);
                return (NULL);
        }
@@ -193,20 +234,56 @@ sonewconn(struct socket *head, int conns
        sigio_init(&so->so_sigio);
        sigio_copy(&so->so_sigio, &head->so_sigio);
 
-       soqinsque(head, so, soqueue);
-       if ((*so->so_proto->pr_attach)(so, 0)) {
-               (void) soqremque(so, soqueue);
+       soqinsque(head, so, 0);
+
+       /*
+        * We need to unlock `head' because PCB layer could release
+        * solock() to enforce desired lock order.
+        */
+       if (persocket) {
+               head->so_newconn++;
+               sounlock(head, SL_LOCKED);
+       }
+
+       error = (*so->so_proto->pr_attach)(so, 0);
+
+       /*
+        * The lock order matters from here.
+        */
+       if (persocket) {
+               sounlock(so, SL_LOCKED);
+               solock_pair(head, so);
+
+               if ((head->so_newconn--) == 0) {
+                       if ((head->so_state & SS_NEWCONN_WAIT) != 0) {
+                               head->so_state &= ~SS_NEWCONN_WAIT;
+                               wakeup(&head->so_newconn);
+                       }
+               }
+       }
+
+       if (error) {
+               soqremque(so, 0);
+               if (persocket)
+                       sounlock(so, SL_LOCKED);
                sigio_free(&so->so_sigio);
                klist_free(&so->so_rcv.sb_sel.si_note);
                klist_free(&so->so_snd.sb_sel.si_note);
                pool_put(&socket_pool, so);
                return (NULL);
        }
+
        if (connstatus) {
+               so->so_state |= connstatus;
+               soqremque(so, 0);
+               soqinsque(head, so, 1);
                sorwakeup(head);
                wakeup(&head->so_timeo);
-               so->so_state |= connstatus;
        }
+
+       if (persocket)
+               sounlock(so, SL_LOCKED);
+
        return (so);
 }
 
@@ -214,6 +291,7 @@ void
 soqinsque(struct socket *head, struct socket *so, int q)
 {
        soassertlocked(head);
+       soassertlocked(so);
 
        KASSERT(so->so_onq == NULL);
 
@@ -233,6 +311,7 @@ soqremque(struct socket *so, int q)
 {
        struct socket *head = so->so_head;
 
+       soassertlocked(so);
        soassertlocked(head);
 
        if (q == 0) {
@@ -284,9 +363,6 @@ solock(struct socket *so)
        case PF_INET6:
                NET_LOCK();
                break;
-       case PF_UNIX:
-               rw_enter_write(&unp_lock);
-               break;
        default:
                rw_enter_write(&so->so_lock);
                break;
@@ -295,6 +371,34 @@ solock(struct socket *so)
        return (SL_LOCKED);
 }
 
+int
+solock_persocket(struct socket *so)
+{
+       switch (so->so_proto->pr_domain->dom_family) {
+       case PF_INET:
+       case PF_INET6:
+               return 0;
+       default:
+               return 1;
+       }
+}
+
+void
+solock_pair(struct socket *so1, struct socket *so2)
+{
+       KASSERT(so1 != so2);
+       KASSERT(so1->so_type == so2->so_type);
+       KASSERT(solock_persocket(so1));
+
+       if (so1 < so2) {
+               solock(so1);
+               solock(so2);
+       } else {
+               solock(so2);
+               solock(so1);
+       }
+}
+
 void
 sounlock(struct socket *so, int s)
 {
@@ -308,9 +412,6 @@ sounlock(struct socket *so, int s)
        case PF_INET6:
                NET_UNLOCK();
                break;
-       case PF_UNIX:
-               rw_exit_write(&unp_lock);
-               break;
        default:
                rw_exit_write(&so->so_lock);
                break;
@@ -325,9 +426,6 @@ soassertlocked(struct socket *so)
        case PF_INET6:
                NET_ASSERT_LOCKED();
                break;
-       case PF_UNIX:
-               rw_assert_wrlock(&unp_lock);
-               break;
        default:
                rw_assert_wrlock(&so->so_lock);
                break;
@@ -344,9 +442,6 @@ sosleep_nsec(struct socket *so, void *id
        case PF_INET:
        case PF_INET6:
                ret = rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs);
-               break;
-       case PF_UNIX:
-               ret = rwsleep_nsec(ident, &unp_lock, prio, wmesg, nsecs);
                break;
        default:
                ret = rwsleep_nsec(ident, &so->so_lock, prio, wmesg, nsecs);
Index: sys/kern/uipc_syscalls.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_syscalls.c,v
retrieving revision 1.194
diff -u -p -r1.194 uipc_syscalls.c
--- sys/kern/uipc_syscalls.c    24 Oct 2021 00:02:25 -0000      1.194
+++ sys/kern/uipc_syscalls.c    18 Nov 2021 10:49:43 -0000
@@ -246,7 +246,7 @@ doaccept(struct proc *p, int sock, struc
        socklen_t namelen;
        int error, s, tmpfd;
        struct socket *head, *so;
-       int cloexec, nflag;
+       int cloexec, nflag, persocket;
 
        cloexec = (flags & SOCK_CLOEXEC) ? UF_EXCLOSE : 0;
 
@@ -269,16 +269,20 @@ doaccept(struct proc *p, int sock, struc
 
        head = headfp->f_data;
        s = solock(head);
+
+       persocket = solock_persocket(head);
+
        if (isdnssocket(head) || (head->so_options & SO_ACCEPTCONN) == 0) {
                error = EINVAL;
-               goto out;
+               goto out_unlock;
        }
+retry:
        if ((headfp->f_flag & FNONBLOCK) && head->so_qlen == 0) {
                if (head->so_state & SS_CANTRCVMORE)
                        error = ECONNABORTED;
                else
                        error = EWOULDBLOCK;
-               goto out;
+               goto out_unlock;
        }
        while (head->so_qlen == 0 && head->so_error == 0) {
                if (head->so_state & SS_CANTRCVMORE) {
@@ -288,18 +292,40 @@ doaccept(struct proc *p, int sock, struc
                error = sosleep_nsec(head, &head->so_timeo, PSOCK | PCATCH,
                    "netcon", INFSLP);
                if (error)
-                       goto out;
+                       goto out_unlock;
        }
        if (head->so_error) {
                error = head->so_error;
                head->so_error = 0;
-               goto out;
+               goto out_unlock;
        }
 
        /*
         * Do not sleep after we have taken the socket out of the queue.
         */
-       so = TAILQ_FIRST(&head->so_q);
+
+       if ((so = TAILQ_FIRST(&head->so_q)) == NULL)
+               panic("accept");
+
+       if (persocket) {
+               if (head < so)
+                       solock(so);
+               else {
+                       soref(so);
+                       sounlock(head, SL_LOCKED);
+                       solock(so);
+                       solock(head);
+
+                       if (so->so_head != head) {
+                               sounlock(so, SL_LOCKED);
+                               sorele(so);
+                               goto retry;
+                       }
+
+                       sorele(so);
+               }
+       }
+
        if (soqremque(so, 1) == 0)
                panic("accept");
 
@@ -310,30 +336,53 @@ doaccept(struct proc *p, int sock, struc
        /* connection has been removed from the listen queue */
        KNOTE(&head->so_rcv.sb_sel.si_note, 0);
 
+       if (persocket)
+               sounlock(head, s);
+
        fp->f_type = DTYPE_SOCKET;
        fp->f_flag = FREAD | FWRITE | nflag;
        fp->f_ops = &socketops;
        fp->f_data = so;
+
        error = soaccept(so, nam);
-out:
-       sounlock(head, s);
-       if (!error && name != NULL)
+
+       /*
+        * It doesn't matter which socket to unlock when we
+        * locked the whole layer.
+        */
+       sounlock(so, s);
+
+       if (error)
+               goto out;
+
+       if (name != NULL) {
                error = copyaddrout(p, nam, name, namelen, anamelen);
-       if (!error) {
-               fdplock(fdp);
-               fdinsert(fdp, tmpfd, cloexec, fp);
-               fdpunlock(fdp);
-               FRELE(fp, p);
-               *retval = tmpfd;
-       } else {
-               fdplock(fdp);
-               fdremove(fdp, tmpfd);
-               fdpunlock(fdp);
-               closef(fp, p);
+               if (error)
+                       goto out;
        }
 
+       fdplock(fdp);
+       fdinsert(fdp, tmpfd, cloexec, fp);
+       fdpunlock(fdp);
+       FRELE(fp, p);
+       *retval = tmpfd;
+
        m_freem(nam);
        FRELE(headfp, p);
+
+       return 0;
+
+out_unlock:
+       sounlock(head, s);
+out:
+       fdplock(fdp);
+       fdremove(fdp, tmpfd);
+       fdpunlock(fdp);
+       closef(fp, p);
+
+       m_freem(nam);
+       FRELE(headfp, p);
+
        return (error);
 }
 
Index: sys/kern/uipc_usrreq.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
retrieving revision 1.158
diff -u -p -r1.158 uipc_usrreq.c
--- sys/kern/uipc_usrreq.c      17 Nov 2021 22:56:19 -0000      1.158
+++ sys/kern/uipc_usrreq.c      18 Nov 2021 10:49:43 -0000
@@ -52,7 +52,6 @@
 #include <sys/pledge.h>
 #include <sys/pool.h>
 #include <sys/rwlock.h>
-#include <sys/mutex.h>
 #include <sys/sysctl.h>
 #include <sys/lock.h>
 
@@ -61,16 +60,17 @@
  *      I       immutable after creation
  *      D       unp_df_lock
  *      G       unp_gc_lock
- *      U       unp_lock
+ *      M       unp_ino_mtx
  *      R       unp_rights_mtx
  *      a       atomic
+ *      s       socket lock
  */
 
-struct rwlock unp_lock = RWLOCK_INITIALIZER("unplock");
 struct rwlock unp_df_lock = RWLOCK_INITIALIZER("unpdflk");
 struct rwlock unp_gc_lock = RWLOCK_INITIALIZER("unpgclk");
 
 struct mutex unp_rights_mtx = MUTEX_INITIALIZER(IPL_SOFTNET);
+struct mutex unp_ino_mtx = MUTEX_INITIALIZER(IPL_SOFTNET);
 
 /*
  * Stack of sets of files that were passed over a socket but were
@@ -88,6 +88,7 @@ void  unp_discard(struct fdpass *, int);
 void   unp_mark(struct fdpass *, int);
 void   unp_scan(struct mbuf *, void (*)(struct fdpass *, int));
 int    unp_nam2sun(struct mbuf *, struct sockaddr_un **, size_t *);
+int    unp_solock_peer(struct socket *, struct socket **);
 
 struct pool unpcb_pool;
 struct task unp_gc_task = TASK_INITIALIZER(unp_gc, NULL);
@@ -121,6 +122,54 @@ unp_init(void)
            IPL_SOFTNET, 0, "unpcb", NULL);
 }
 
+int
+unp_solock_peer(struct socket *so, struct socket **ret)
+{
+       struct unpcb *unp;
+       struct socket *so2;
+
+       unp = so->so_pcb;
+
+       if (unp->unp_conn == NULL)
+               return ENOTCONN;
+
+       so2 = unp->unp_conn->unp_socket;
+
+       if (so < so2)
+               solock(so2);
+       else if (so > so2){
+               struct unpcb *unp2;
+
+               soref(so2);
+               sounlock(so, SL_LOCKED);
+
+               solock(so2);
+               solock(so);
+
+               unp2 = so2->so_pcb;
+
+               /*
+                * The peer could be closed due to re-lock. In the case
+                * `so' is the datagram socket it could be connected
+                * again by concurrent PRU_CONNECT thread to another
+                * datagram socket. This socket could be newly created
+                * and have `so_pcb' with the address our previous peer
+                * had.
+                */
+               if (unp->unp_conn != unp2 || unp2 == NULL) {
+                       sounlock(so2, SL_LOCKED);
+                       sorele(so2);
+                       return EPIPE;
+               }
+
+               sorele(so2);
+       }
+
+       *ret = so2;
+
+       return 0;
+}
+
 void
 uipc_setaddr(const struct unpcb *unp, struct mbuf *nam)
 {
@@ -195,6 +244,12 @@ uipc_usrreq(struct socket *so, int req, 
                 * if it was bound and we are still connected
                 * (our peer may have closed already!).
                 */
+               /*
+                * Don't need to lock `unp_conn'. The `unp_addr' is
+                * immutable since we set it within unp_connect().
+                * Both sockets are locked while we connecting them
+                * so it's enough to hold lock on `unp'.
+                */
                uipc_setaddr(unp->unp_conn, nam);
                break;
 
@@ -212,9 +267,8 @@ uipc_usrreq(struct socket *so, int req, 
 
                case SOCK_STREAM:
                case SOCK_SEQPACKET:
-                       if (unp->unp_conn == NULL)
+                       if ((unp_solock_peer(so, &so2)) != 0)
                                break;
-                       so2 = unp->unp_conn->unp_socket;
                        /*
                         * Adjust backpressure on sender
                         * and wakeup any waiting to write.
@@ -222,6 +276,7 @@ uipc_usrreq(struct socket *so, int req, 
                        so2->so_snd.sb_mbcnt = so->so_rcv.sb_mbcnt;
                        so2->so_snd.sb_cc = so->so_rcv.sb_cc;
                        sowwakeup(so2);
+                       sounlock(so2, SL_LOCKED);
                        break;
 
                default:
@@ -250,13 +305,14 @@ uipc_usrreq(struct socket *so, int req, 
                                error = unp_connect(so, nam, p);
                                if (error)
                                        break;
-                       } else {
-                               if (unp->unp_conn == NULL) {
-                                       error = ENOTCONN;
-                                       break;
-                               }
                        }
-                       so2 = unp->unp_conn->unp_socket;
+
+                       if ((error = unp_solock_peer(so, &so2)) != 0) {
+                               if (nam != NULL)
+                                       error = ECONNREFUSED;
+                               break;
+                       }
+
                        if (unp->unp_addr)
                                from = mtod(unp->unp_addr, struct sockaddr *);
                        else
@@ -267,6 +323,10 @@ uipc_usrreq(struct socket *so, int req, 
                                control = NULL;
                        } else
                                error = ENOBUFS;
+
+                       if (so2 != so)
+                               sounlock(so2, SL_LOCKED);
+
                        if (nam)
                                unp_disconnect(unp);
                        break;
@@ -278,11 +338,10 @@ uipc_usrreq(struct socket *so, int req, 
                                error = EPIPE;
                                break;
                        }
-                       if (unp->unp_conn == NULL) {
-                               error = ENOTCONN;
+                       if ((error = unp_solock_peer(so, &so2)) != 0) {
                                break;
                        }
-                       so2 = unp->unp_conn->unp_socket;
+
                        /*
                         * Send to paired receive port, and then raise
                         * send buffer counts to maintain backpressure.
@@ -304,6 +363,8 @@ uipc_usrreq(struct socket *so, int req, 
                        so->so_snd.sb_cc = so2->so_rcv.sb_cc;
                        if (so2->so_rcv.sb_cc > 0)
                                sorwakeup(so2);
+
+                       sounlock(so2, SL_LOCKED);
                        m = NULL;
                        break;
 
@@ -317,12 +378,7 @@ uipc_usrreq(struct socket *so, int req, 
 
        case PRU_ABORT:
                unp_detach(unp);
-               /*
-                * As long as `unp_lock' is taken before entering
-                * uipc_usrreq() releasing it here would lead to a
-                * double unlock.
-                */
-               sofree(so, SL_NOUNLOCK);
+               sofree(so, SL_LOCKED);
                break;
 
        case PRU_SENSE: {
@@ -330,8 +386,10 @@ uipc_usrreq(struct socket *so, int req, 
 
                sb->st_blksize = so->so_snd.sb_hiwat;
                sb->st_dev = NODEV;
+               mtx_enter(&unp_ino_mtx);
                if (unp->unp_ino == 0)
                        unp->unp_ino = unp_ino++;
+               mtx_leave(&unp_ino_mtx);
                sb->st_atim.tv_sec =
                    sb->st_mtim.tv_sec =
                    sb->st_ctim.tv_sec = unp->unp_ctime.tv_sec;
@@ -352,6 +410,12 @@ uipc_usrreq(struct socket *so, int req, 
                break;
 
        case PRU_PEERADDR:
+               /*
+                * Don't need to lock `unp_conn'. The `unp_addr' is
+                * immutable since we set it within unp_connect().
+                * Both sockets are locked while we connecting them
+                * so it's enough to hold lock on `unp'.
+                */
                uipc_setaddr(unp->unp_conn, nam);
                break;
 
@@ -404,8 +468,6 @@ uipc_attach(struct socket *so, int proto
        struct unpcb *unp;
        int error;
 
-       rw_assert_wrlock(&unp_lock);
-
        if (so->so_pcb)
                return EISCONN;
        if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
@@ -439,12 +501,6 @@ uipc_attach(struct socket *so, int proto
        /*
         * Enforce `unp_gc_lock' -> `solock()' lock order.
         */
-       /*
-        * We also release the lock on listening socket and on our peer
-        * socket when called from unp_connect(). This is safe. The
-        * listening socket protected by vnode(9) lock. The peer socket
-        * has 'UNP_CONNECTING' flag set.
-        */
        sounlock(so, SL_LOCKED);
        rw_enter_write(&unp_gc_lock);
        LIST_INSERT_HEAD(&unp_head, unp, unp_link);
@@ -506,14 +562,13 @@ unp_detach(struct unpcb *unp)
 {
        struct socket *so = unp->unp_socket;
        struct vnode *vp = unp->unp_vnode;
-
-       rw_assert_wrlock(&unp_lock);
+       struct unpcb *unp2;
 
        unp->unp_vnode = NULL;
 
        /*
         * Enforce `unp_gc_lock' -> `solock()' lock order.
-        * Enforce `i_lock' -> `unp_lock' lock order.
+        * Enforce `i_lock' -> `solock()' lock order.
         */
        sounlock(so, SL_LOCKED);
 
@@ -532,10 +587,40 @@ unp_detach(struct unpcb *unp)
 
        solock(so);
 
-       if (unp->unp_conn)
+       if (unp->unp_conn != NULL) {
+               /*
+                * Datagram socket could be connected to itself.
+                * Such socket will be disconnected here.
+                */
                unp_disconnect(unp);
-       while (!SLIST_EMPTY(&unp->unp_refs))
-               unp_drop(SLIST_FIRST(&unp->unp_refs), ECONNRESET);
+       }
+
+       while ((unp2 = SLIST_FIRST(&unp->unp_refs)) != NULL) {
+               struct socket *so2 = unp2->unp_socket;
+
+               soref(so2);
+               sounlock(so, SL_LOCKED);
+               solock(so2);
+
+               /*
+                * soref() doesn't protect `so_pcb'. `unp2' 
+                * derefernce is not safe after sounlock().
+                */
+               unp2 = so2->so_pcb;
+
+               /*
+                * The datagram socket could be closed. Also it
+                * could be disconnected from us and connected
+                * again to another socket.
+                */
+               if (unp2 != NULL && unp2->unp_conn == unp)
+                       unp_drop(unp2, ECONNRESET);
+
+               sounlock(so2, SL_LOCKED);
+               sorele(so2);
+               solock(so);
+       }
+
        soisdisconnected(so);
        so->so_pcb = NULL;
        m_freem(unp->unp_addr);
@@ -675,24 +760,42 @@ unp_connect(struct socket *so, struct mb
        }
        if ((error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p)) != 0)
                goto put;
-       solock(so);
        so2 = vp->v_socket;
        if (so2 == NULL) {
                error = ECONNREFUSED;
-               goto put_locked;
+               goto put;
        }
        if (so->so_type != so2->so_type) {
                error = EPROTOTYPE;
-               goto put_locked;
+               goto put;
        }
+
        if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
+               solock(so2);
+
                if ((so2->so_options & SO_ACCEPTCONN) == 0 ||
                    (so3 = sonewconn(so2, 0)) == NULL) {
                        error = ECONNREFUSED;
-                       goto put_locked;
                }
+
+               sounlock(so2, SL_LOCKED);
+
+               if (error != 0)
+                       goto put;
+
+               /*
+                * Since `so2' is protected by vnode(9) lock, `so3'
+                * can't be PRU_ABORT'ed here.
+                */
+               solock_pair(so, so3);
+
                unp2 = sotounpcb(so2);
                unp3 = sotounpcb(so3);
+
+               /*
+                * `unp_addr', `unp_connid' and 'UNP_FEIDSBIND' flag
+                * are immutable since we set them in unp_bind().
+                */
                if (unp2->unp_addr)
                        unp3->unp_addr =
                            m_copym(unp2->unp_addr, 0, M_COPYALL, M_NOWAIT);
@@ -700,15 +803,29 @@ unp_connect(struct socket *so, struct mb
                unp3->unp_connid.gid = p->p_ucred->cr_gid;
                unp3->unp_connid.pid = p->p_p->ps_pid;
                unp3->unp_flags |= UNP_FEIDS;
-               so2 = so3;
+
                if (unp2->unp_flags & UNP_FEIDSBIND) {
                        unp->unp_connid = unp2->unp_connid;
                        unp->unp_flags |= UNP_FEIDS;
                }
+
+               so2 = so3;
+       } else {
+               if (so2 != so)
+                       solock_pair(so, so2);
+               else
+                       solock(so);
        }
+
        error = unp_connect2(so, so2);
-put_locked:
+
        sounlock(so, SL_LOCKED);
+
+       /*
+        * `so2' can't be PRU_ABORT'ed concurrently
+        */
+       if (so2 != so)
+               sounlock(so2, SL_LOCKED);
 put:
        vput(vp);
 unlock:
@@ -732,7 +849,8 @@ unp_connect2(struct socket *so, struct s
        struct unpcb *unp = sotounpcb(so);
        struct unpcb *unp2;
 
-       rw_assert_wrlock(&unp_lock);
+       soassertlocked(so);
+       soassertlocked(so2);
 
        if (so2->so_type != so->so_type)
                return (EPROTOTYPE);
@@ -747,6 +865,13 @@ unp_connect2(struct socket *so, struct s
 
        case SOCK_STREAM:
        case SOCK_SEQPACKET:
+               /*
+                * soisconnected() could re-lock `so2' and will lock
+                * 'so2->so_head'. `so2' is not yet exposed to userland
+                * and we can't have concurrent thread which locks `so2'
+                * and then `so'. Also we can't have concurrent thread
+                * which simultaneously locks `so' and 'so2->so_head'.
+                */
                unp2->unp_conn = unp;
                soisconnected(so);
                soisconnected(so2);
@@ -761,11 +886,15 @@ unp_connect2(struct socket *so, struct s
 void
 unp_disconnect(struct unpcb *unp)
 {
-       struct unpcb *unp2 = unp->unp_conn;
+       struct socket *so2;
+       struct unpcb *unp2;
 
-       if (unp2 == NULL)
+       if ((unp_solock_peer(unp->unp_socket, &so2)) != 0)
                return;
+
+       unp2 = unp->unp_conn;
        unp->unp_conn = NULL;
+
        switch (unp->unp_socket->so_type) {
 
        case SOCK_DGRAM:
@@ -784,18 +913,23 @@ unp_disconnect(struct unpcb *unp)
                soisdisconnected(unp2->unp_socket);
                break;
        }
+
+       if (so2 != unp->unp_socket)
+               sounlock(so2, SL_LOCKED);
 }
 
 void
 unp_shutdown(struct unpcb *unp)
 {
-       struct socket *so;
+       struct socket *so2;
 
        switch (unp->unp_socket->so_type) {
        case SOCK_STREAM:
        case SOCK_SEQPACKET:
-               if (unp->unp_conn && (so = unp->unp_conn->unp_socket))
-                       socantrcvmore(so);
+               if ((unp_solock_peer(unp->unp_socket, &so2)) == 0) {
+                       socantrcvmore(so2);
+                       sounlock(so2, SL_LOCKED);
+               }
                break;
        default:
                break;
@@ -807,7 +941,7 @@ unp_drop(struct unpcb *unp, int errno)
 {
        struct socket *so = unp->unp_socket;
 
-       rw_assert_wrlock(&unp_lock);
+       soassertlocked(so);
 
        so->so_error = errno;
        unp_disconnect(unp);
Index: sys/miscfs/fifofs/fifo_vnops.c
===================================================================
RCS file: /cvs/src/sys/miscfs/fifofs/fifo_vnops.c,v
retrieving revision 1.85
diff -u -p -r1.85 fifo_vnops.c
--- sys/miscfs/fifofs/fifo_vnops.c      24 Oct 2021 11:23:22 -0000      1.85
+++ sys/miscfs/fifofs/fifo_vnops.c      18 Nov 2021 10:49:43 -0000
@@ -156,7 +156,7 @@ fifo_open(void *v)
        struct vnode *vp = ap->a_vp;
        struct fifoinfo *fip;
        struct socket *rso, *wso;
-       int s, error;
+       int error;
 
        if ((fip = vp->v_fifoinfo) == NULL) {
                fip = malloc(sizeof(*fip), M_VNODE, M_WAITOK);
@@ -182,18 +182,20 @@ fifo_open(void *v)
                        return (error);
                }
                fip->fi_readers = fip->fi_writers = 0;
-               s = solock(wso);
+               solock(wso);
                wso->so_state |= SS_CANTSENDMORE;
                wso->so_snd.sb_lowat = PIPE_BUF;
+               sounlock(wso, SL_LOCKED);
        } else {
                rso = fip->fi_readsock;
                wso = fip->fi_writesock;
-               s = solock(wso);
        }
        if (ap->a_mode & FREAD) {
                fip->fi_readers++;
                if (fip->fi_readers == 1) {
+                       solock(wso);
                        wso->so_state &= ~SS_CANTSENDMORE;
+                       sounlock(wso, SL_LOCKED);
                        if (fip->fi_writers > 0)
                                wakeup(&fip->fi_writers);
                }
@@ -202,16 +204,16 @@ fifo_open(void *v)
                fip->fi_writers++;
                if ((ap->a_mode & O_NONBLOCK) && fip->fi_readers == 0) {
                        error = ENXIO;
-                       sounlock(wso, s);
                        goto bad;
                }
                if (fip->fi_writers == 1) {
+                       solock(rso);
                        rso->so_state &= ~(SS_CANTRCVMORE|SS_ISDISCONNECTED);
+                       sounlock(rso, SL_LOCKED);
                        if (fip->fi_readers > 0)
                                wakeup(&fip->fi_readers);
                }
        }
-       sounlock(wso, s);
        if ((ap->a_mode & O_NONBLOCK) == 0) {
                if ((ap->a_mode & FREAD) && fip->fi_writers == 0) {
                        VOP_UNLOCK(vp);
@@ -327,17 +329,16 @@ fifo_poll(void *v)
        struct socket *wso = ap->a_vp->v_fifoinfo->fi_writesock;
        int events = 0;
        int revents = 0;
-       int s;
 
        /*
         * FIFOs don't support out-of-band or high priority data.
         */
-       s = solock(rso);
        if (ap->a_fflag & FREAD)
                events |= ap->a_events & (POLLIN | POLLRDNORM);
        if (ap->a_fflag & FWRITE)
                events |= ap->a_events & (POLLOUT | POLLWRNORM);
 
+       solock_pair(rso, wso);
        if (events & (POLLIN | POLLRDNORM)) {
                if (soreadable(rso))
                        revents |= events & (POLLIN | POLLRDNORM);
@@ -362,7 +363,8 @@ fifo_poll(void *v)
                        wso->so_snd.sb_flags |= SB_SEL;
                }
        }
-       sounlock(rso, s);
+       sounlock(rso, SL_LOCKED);
+       sounlock(wso, SL_LOCKED);
        return (revents);
 }
 
Index: sys/sys/socketvar.h
===================================================================
RCS file: /cvs/src/sys/sys/socketvar.h,v
retrieving revision 1.101
diff -u -p -r1.101 socketvar.h
--- sys/sys/socketvar.h 6 Nov 2021 05:26:33 -0000       1.101
+++ sys/sys/socketvar.h 18 Nov 2021 10:49:43 -0000
@@ -38,6 +38,7 @@
 #include <sys/task.h>
 #include <sys/timeout.h>
 #include <sys/rwlock.h>
+#include <sys/refcnt.h>
 
 #ifndef        _SOCKLEN_T_DEFINED_
 #define        _SOCKLEN_T_DEFINED_
@@ -55,6 +56,7 @@ TAILQ_HEAD(soqhead, socket);
 struct socket {
        const struct protosw *so_proto; /* protocol handle */
        struct rwlock so_lock;          /* this socket lock */
+       struct refcnt so_refcnt;        /* references to this socket */
        void    *so_pcb;                /* protocol control block */
        u_int   so_state;               /* internal state flags SS_*, below */
        short   so_type;                /* generic type, see socket.h */
@@ -80,6 +82,7 @@ struct socket {
        short   so_q0len;               /* partials on so_q0 */
        short   so_qlen;                /* number of connections on so_q */
        short   so_qlimit;              /* max number queued connections */
+       u_long  so_newconn;             /* # of pending sonewconn() threads */
        short   so_timeo;               /* connection timeout */
        u_long  so_oobmark;             /* chars to oob mark */
        u_int   so_error;               /* error affecting connection */
@@ -150,6 +153,7 @@ struct socket {
 #define        SS_CONNECTOUT           0x1000  /* connect, not accept, at this 
end */
 #define        SS_ISSENDING            0x2000  /* hint for lower layer */
 #define        SS_DNS                  0x4000  /* created using SOCK_DNS 
socket(2) */
+#define        SS_NEWCONN_WAIT         0x8000  /* waiting sonewconn() relock */
 
 #ifdef _KERNEL
 
@@ -163,6 +167,18 @@ struct socket {
 
 void   soassertlocked(struct socket *);
 
+static inline void
+soref(struct socket *so)
+{
+       refcnt_take(&so->so_refcnt);
+}
+
+static inline void
+sorele(struct socket *so)
+{
+       refcnt_rele_wake(&so->so_refcnt);
+}
+
 /*
  * Macros for sockets and socket buffering.
  */
@@ -337,6 +353,8 @@ int sockargs(struct mbuf **, const void 
 
 int    sosleep_nsec(struct socket *, void *, int, const char *, uint64_t);
 int    solock(struct socket *);
+int    solock_persocket(struct socket *);
+void   solock_pair(struct socket *, struct socket *);
 void   sounlock(struct socket *, int);
 
 int    sendit(struct proc *, int, struct msghdr *, int, register_t *);
Index: sys/sys/unpcb.h
===================================================================
RCS file: /cvs/src/sys/sys/unpcb.h,v
retrieving revision 1.20
diff -u -p -r1.20 unpcb.h
--- sys/sys/unpcb.h     16 Nov 2021 08:56:20 -0000      1.20
+++ sys/sys/unpcb.h     18 Nov 2021 10:49:43 -0000
@@ -60,24 +60,25 @@
  * Locks used to protect struct members:
  *      I       immutable after creation
  *      G       unp_gc_lock
- *      U       unp_lock
  *      a       atomic
+ *      s       socket lock
  */
 
 
 struct unpcb {
        struct  socket *unp_socket;     /* [I] pointer back to socket */
-       struct  vnode *unp_vnode;       /* [U] if associated with file */
+       struct  vnode *unp_vnode;       /* [s] if associated with file */
        struct  file *unp_file;         /* [a] backpointer for unp_gc() */
-       struct  unpcb *unp_conn;        /* [U] control block of connected 
socket */
-       ino_t   unp_ino;                /* [U] fake inode number */
-       SLIST_HEAD(,unpcb) unp_refs;    /* [U] referencing socket linked list */
-       SLIST_ENTRY(unpcb) unp_nextref; /* [U] link in unp_refs list */
-       struct  mbuf *unp_addr;         /* [U] bound address of socket */
+       struct  unpcb *unp_conn;        /* [s] control block of connected
+                                               socket */
+       ino_t   unp_ino;                /* [s] fake inode number */
+       SLIST_HEAD(,unpcb) unp_refs;    /* [s] referencing socket linked list */
+       SLIST_ENTRY(unpcb) unp_nextref; /* [s] link in unp_refs list */
+       struct  mbuf *unp_addr;         /* [s] bound address of socket */
        long    unp_msgcount;           /* [a] references from socket rcv buf */
-       int     unp_flags;              /* [U] this unpcb contains peer eids */
-       int     unp_gcflags;            /* [G] garbge collector flags */
-       struct  sockpeercred unp_connid;/* [U] id of peer process */
+       int     unp_flags;              /* [s] this unpcb contains peer eids */
+       int     unp_gcflags;            /* [G] garbage collector flags */
+       struct  sockpeercred unp_connid;/* [s] id of peer process */
        struct  timespec unp_ctime;     /* [I] holds creation time */
        LIST_ENTRY(unpcb) unp_link;     /* [G] link in per-AF list of sockets */
 };

Reply via email to