The UNIX sockets garbage collector was moved out of the global `unp_lock' rwlock(9), which locks the whole layer. Now `unp_lock' is only used to protect per-socket data, and it's time to replace it with the per-socket `so_lock'.
Unlike PF_ROUTE and PF_KEY sockets, we have paths where multiple sockets must be locked simultaneously. This introduces some problems, and re-lock dances are required to solve them. Except for the case where a single thread locks multiple sockets, the lock order matters. To avoid deadlock we always lock the socket with the smallest pointer address first. We have cases where one socket `so1' is already locked and the second socket `so2' we want to lock has the smaller address. We then need to unlock `so1', lock `so2' and then lock `so1' again. In such cases the lock on `so1' also protects `so2' from being killed by a concurrent thread. To prevent this, we take an extra reference on `so2' before re-locking `so1' and release it afterwards. Also, `so1' could change its state while unlocked, so we need to re-check it. This introduces re-lock dances not only in the PCB layer but in the sockets layer too. In the sockets layer, PF_INET and PF_INET6 sockets share the same code path with PF_UNIX sockets. The solock_persocket() function was introduced to determine which locking strategy should be used. Please test the diff and give me feedback. I'm also interested in positive experiences, so feel free to report those too. 
Index: sys/kern/uipc_socket.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_socket.c,v retrieving revision 1.269 diff -u -p -r1.269 uipc_socket.c --- sys/kern/uipc_socket.c 11 Nov 2021 16:35:09 -0000 1.269 +++ sys/kern/uipc_socket.c 18 Nov 2021 10:49:43 -0000 @@ -52,6 +52,7 @@ #include <sys/atomic.h> #include <sys/rwlock.h> #include <sys/time.h> +#include <sys/refcnt.h> #ifdef DDB #include <machine/db_machdep.h> @@ -156,7 +157,9 @@ soalloc(int prflags) so = pool_get(&socket_pool, prflags); if (so == NULL) return (NULL); - rw_init(&so->so_lock, "solock"); + rw_init_flags(&so->so_lock, "solock", RWL_DUPOK); + refcnt_init(&so->so_refcnt); + return (so); } @@ -257,6 +260,8 @@ solisten(struct socket *so, int backlog) void sofree(struct socket *so, int s) { + int persocket = solock_persocket(so); + soassertlocked(so); if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) { @@ -264,16 +269,57 @@ sofree(struct socket *so, int s) return; } if (so->so_head) { + struct socket *head = so->so_head; + /* * We must not decommission a socket that's on the accept(2) * queue. If we do, then accept(2) may hang after select(2) * indicated that the listening socket was ready. */ - if (!soqremque(so, 0)) { + if (so->so_onq == &head->so_q) { sounlock(so, s); return; } + + if (persocket) { + if (so < head) + solock(head); + else { + /* + * Concurrent close of `head' could + * abort `so' due to re-lock. 
+ */ + soref(so); + soref(head); + sounlock(so, SL_LOCKED); + solock(head); + solock(so); + + if (so->so_onq != &head->so_q0) { + sounlock(head, SL_LOCKED); + sounlock(so, SL_LOCKED); + sorele(head); + sorele(so); + return; + } + + sorele(head); + sorele(so); + } + } + + soqremque(so, 0); + + if (persocket) + sounlock(head, SL_LOCKED); } + + if (persocket) { + sounlock(so, SL_LOCKED); + refcnt_finalize(&so->so_refcnt, "sofinal"); + solock(so); + } + sigio_free(&so->so_sigio); klist_free(&so->so_rcv.sb_sel.si_note); klist_free(&so->so_snd.sb_sel.si_note); @@ -363,13 +409,68 @@ drop: error = error2; } if (so->so_options & SO_ACCEPTCONN) { + int persocket = solock_persocket(so); + + if (persocket) { + /* Wait concurrent sonewconn() threads. */ + while (so->so_newconn > 0) { + so->so_state |= SS_NEWCONN_WAIT; + sosleep_nsec(so, &so->so_newconn, PSOCK, + "netlck", INFSLP); + } + } + while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) { + if (persocket) { + if (so < so2) + solock(so2); + else { + soref(so2); + sounlock(so, SL_LOCKED); + solock(so2); + solock(so); + + if (so2->so_onq != &so->so_q0) { + sounlock(so2, SL_LOCKED); + sorele(so2); + continue; + } + + sorele(so2); + } + } (void) soqremque(so2, 0); + if (persocket) + sounlock(so, SL_LOCKED); (void) soabort(so2); + if (persocket) + solock(so); } while ((so2 = TAILQ_FIRST(&so->so_q)) != NULL) { + if (persocket) { + if (so < so2) + solock(so2); + else { + soref(so2); + sounlock(so, SL_LOCKED); + solock(so2); + solock(so); + + if (so2->so_onq != &so->so_q) { + sounlock(so2, SL_LOCKED); + sorele(so2); + continue; + } + + sorele(so2); + } + } (void) soqremque(so2, 1); + if (persocket) + sounlock(so, SL_LOCKED); (void) soabort(so2); + if (persocket) + solock(so); } } discard: @@ -437,11 +538,19 @@ soconnect(struct socket *so, struct mbuf int soconnect2(struct socket *so1, struct socket *so2) { - int s, error; + int persocket, s, error; + + if ((persocket = solock_persocket(so1))) { + solock_pair(so1, so2); + s = SL_LOCKED; 
+ } else + s = solock(so1); - s = solock(so1); error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2, NULL, (struct mbuf *)so2, NULL, curproc); + + if (persocket) + sounlock(so2, s); sounlock(so1, s); return (error); } Index: sys/kern/uipc_socket2.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_socket2.c,v retrieving revision 1.116 diff -u -p -r1.116 uipc_socket2.c --- sys/kern/uipc_socket2.c 6 Nov 2021 05:26:33 -0000 1.116 +++ sys/kern/uipc_socket2.c 18 Nov 2021 10:49:43 -0000 @@ -53,8 +53,6 @@ u_long sb_max = SB_MAX; /* patchable */ extern struct pool mclpools[]; extern struct pool mbpool; -extern struct rwlock unp_lock; - /* * Procedures to manipulate state flags of socket * and do appropriate wakeups. Normal sequence from the @@ -101,10 +99,42 @@ soisconnected(struct socket *so) soassertlocked(so); so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING); so->so_state |= SS_ISCONNECTED; - if (head && soqremque(so, 0)) { + + if (head != NULL && so->so_onq == &head->so_q0) { + int persocket = solock_persocket(so); + + if (persocket) { + if (so < head) + solock(head); + else { + soref(so); + soref(head); + + sounlock(so, SL_LOCKED); + solock(head); + solock(so); + + if (so->so_onq != &head->so_q0) { + sounlock(head, SL_LOCKED); + sounlock(so, SL_LOCKED); + sorele(head); + sorele(so); + + return; + } + + sorele(head); + sorele(so); + } + } + + soqremque(so, 0); soqinsque(head, so, 1); sorwakeup(head); wakeup_one(&head->so_timeo); + + if (persocket) + sounlock(head, SL_LOCKED); } else { wakeup(&so->so_timeo); sorwakeup(so); @@ -146,7 +176,8 @@ struct socket * sonewconn(struct socket *head, int connstatus) { struct socket *so; - int soqueue = connstatus ? 
1 : 0; + int persocket = solock_persocket(head); + int error; /* * XXXSMP as long as `so' and `head' share the same lock, we @@ -174,10 +205,20 @@ sonewconn(struct socket *head, int conns so->so_rgid = head->so_rgid; so->so_cpid = head->so_cpid; + if (persocket) { + /* + * Lock order doesn't matter. We are the only thread + * which simultaneously locks these sockets. + */ + solock(so); + } + /* * Inherit watermarks but those may get clamped in low mem situations. */ if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { + if (persocket) + sounlock(so, SL_LOCKED); pool_put(&socket_pool, so); return (NULL); } @@ -193,20 +234,56 @@ sonewconn(struct socket *head, int conns sigio_init(&so->so_sigio); sigio_copy(&so->so_sigio, &head->so_sigio); - soqinsque(head, so, soqueue); - if ((*so->so_proto->pr_attach)(so, 0)) { - (void) soqremque(so, soqueue); + soqinsque(head, so, 0); + + /* + * We need to unlock `head' because PCB layer could release + * solock() to enforce desired lock order. + */ + if (persocket) { + head->so_newconn++; + sounlock(head, SL_LOCKED); + } + + error = (*so->so_proto->pr_attach)(so, 0); + + /* + * The lock order matters from here. 
+ */ + if (persocket) { + sounlock(so, SL_LOCKED); + solock_pair(head, so); + + if ((head->so_newconn--) == 0) { + if ((head->so_state & SS_NEWCONN_WAIT) != 0) { + head->so_state &= ~SS_NEWCONN_WAIT; + wakeup(&head->so_newconn); + } + } + } + + if (error) { + soqremque(so, 0); + if (persocket) + sounlock(so, SL_LOCKED); sigio_free(&so->so_sigio); klist_free(&so->so_rcv.sb_sel.si_note); klist_free(&so->so_snd.sb_sel.si_note); pool_put(&socket_pool, so); return (NULL); } + if (connstatus) { + so->so_state |= connstatus; + soqremque(so, 0); + soqinsque(head, so, 1); sorwakeup(head); wakeup(&head->so_timeo); - so->so_state |= connstatus; } + + if (persocket) + sounlock(so, SL_LOCKED); + return (so); } @@ -214,6 +291,7 @@ void soqinsque(struct socket *head, struct socket *so, int q) { soassertlocked(head); + soassertlocked(so); KASSERT(so->so_onq == NULL); @@ -233,6 +311,7 @@ soqremque(struct socket *so, int q) { struct socket *head = so->so_head; + soassertlocked(so); soassertlocked(head); if (q == 0) { @@ -284,9 +363,6 @@ solock(struct socket *so) case PF_INET6: NET_LOCK(); break; - case PF_UNIX: - rw_enter_write(&unp_lock); - break; default: rw_enter_write(&so->so_lock); break; @@ -295,6 +371,34 @@ solock(struct socket *so) return (SL_LOCKED); } +int +solock_persocket(struct socket *so) +{ + switch (so->so_proto->pr_domain->dom_family) { + case PF_INET: + case PF_INET6: + return 0; + default: + return 1; + } +} + +void +solock_pair(struct socket *so1, struct socket *so2) +{ + KASSERT(so1 != so2); + KASSERT(so1->so_type == so2->so_type); + KASSERT(solock_persocket(so1)); + + if (so1 < so2) { + solock(so1); + solock(so2); + } else { + solock(so2); + solock(so1); + } +} + void sounlock(struct socket *so, int s) { @@ -308,9 +412,6 @@ sounlock(struct socket *so, int s) case PF_INET6: NET_UNLOCK(); break; - case PF_UNIX: - rw_exit_write(&unp_lock); - break; default: rw_exit_write(&so->so_lock); break; @@ -325,9 +426,6 @@ soassertlocked(struct socket *so) case PF_INET6: 
NET_ASSERT_LOCKED(); break; - case PF_UNIX: - rw_assert_wrlock(&unp_lock); - break; default: rw_assert_wrlock(&so->so_lock); break; @@ -344,9 +442,6 @@ sosleep_nsec(struct socket *so, void *id case PF_INET: case PF_INET6: ret = rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs); - break; - case PF_UNIX: - ret = rwsleep_nsec(ident, &unp_lock, prio, wmesg, nsecs); break; default: ret = rwsleep_nsec(ident, &so->so_lock, prio, wmesg, nsecs); Index: sys/kern/uipc_syscalls.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_syscalls.c,v retrieving revision 1.194 diff -u -p -r1.194 uipc_syscalls.c --- sys/kern/uipc_syscalls.c 24 Oct 2021 00:02:25 -0000 1.194 +++ sys/kern/uipc_syscalls.c 18 Nov 2021 10:49:43 -0000 @@ -246,7 +246,7 @@ doaccept(struct proc *p, int sock, struc socklen_t namelen; int error, s, tmpfd; struct socket *head, *so; - int cloexec, nflag; + int cloexec, nflag, persocket; cloexec = (flags & SOCK_CLOEXEC) ? UF_EXCLOSE : 0; @@ -269,16 +269,20 @@ doaccept(struct proc *p, int sock, struc head = headfp->f_data; s = solock(head); + + persocket = solock_persocket(head); + if (isdnssocket(head) || (head->so_options & SO_ACCEPTCONN) == 0) { error = EINVAL; - goto out; + goto out_unlock; } +retry: if ((headfp->f_flag & FNONBLOCK) && head->so_qlen == 0) { if (head->so_state & SS_CANTRCVMORE) error = ECONNABORTED; else error = EWOULDBLOCK; - goto out; + goto out_unlock; } while (head->so_qlen == 0 && head->so_error == 0) { if (head->so_state & SS_CANTRCVMORE) { @@ -288,18 +292,40 @@ doaccept(struct proc *p, int sock, struc error = sosleep_nsec(head, &head->so_timeo, PSOCK | PCATCH, "netcon", INFSLP); if (error) - goto out; + goto out_unlock; } if (head->so_error) { error = head->so_error; head->so_error = 0; - goto out; + goto out_unlock; } /* * Do not sleep after we have taken the socket out of the queue. 
*/ - so = TAILQ_FIRST(&head->so_q); + + if ((so = TAILQ_FIRST(&head->so_q)) == NULL) + panic("accept"); + + if (persocket) { + if (head < so) + solock(so); + else { + soref(so); + sounlock(head, SL_LOCKED); + solock(so); + solock(head); + + if (so->so_head != head) { + sounlock(so, SL_LOCKED); + sorele(so); + goto retry; + } + + sorele(so); + } + } + if (soqremque(so, 1) == 0) panic("accept"); @@ -310,30 +336,53 @@ doaccept(struct proc *p, int sock, struc /* connection has been removed from the listen queue */ KNOTE(&head->so_rcv.sb_sel.si_note, 0); + if (persocket) + sounlock(head, s); + fp->f_type = DTYPE_SOCKET; fp->f_flag = FREAD | FWRITE | nflag; fp->f_ops = &socketops; fp->f_data = so; + error = soaccept(so, nam); -out: - sounlock(head, s); - if (!error && name != NULL) + + /* + * It doesn't matter which socket to unlock when we + * locked the whole layer. + */ + sounlock(so, s); + + if (error) + goto out; + + if (name != NULL) { error = copyaddrout(p, nam, name, namelen, anamelen); - if (!error) { - fdplock(fdp); - fdinsert(fdp, tmpfd, cloexec, fp); - fdpunlock(fdp); - FRELE(fp, p); - *retval = tmpfd; - } else { - fdplock(fdp); - fdremove(fdp, tmpfd); - fdpunlock(fdp); - closef(fp, p); + if (error) + goto out; } + fdplock(fdp); + fdinsert(fdp, tmpfd, cloexec, fp); + fdpunlock(fdp); + FRELE(fp, p); + *retval = tmpfd; + m_freem(nam); FRELE(headfp, p); + + return 0; + +out_unlock: + sounlock(head, s); +out: + fdplock(fdp); + fdremove(fdp, tmpfd); + fdpunlock(fdp); + closef(fp, p); + + m_freem(nam); + FRELE(headfp, p); + return (error); } Index: sys/kern/uipc_usrreq.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v retrieving revision 1.158 diff -u -p -r1.158 uipc_usrreq.c --- sys/kern/uipc_usrreq.c 17 Nov 2021 22:56:19 -0000 1.158 +++ sys/kern/uipc_usrreq.c 18 Nov 2021 10:49:43 -0000 @@ -52,7 +52,6 @@ #include <sys/pledge.h> #include <sys/pool.h> #include <sys/rwlock.h> -#include <sys/mutex.h> 
#include <sys/sysctl.h> #include <sys/lock.h> @@ -61,16 +60,17 @@ * I immutable after creation * D unp_df_lock * G unp_gc_lock - * U unp_lock + * M unp_ino_mtx * R unp_rights_mtx * a atomic + * s socket lock */ -struct rwlock unp_lock = RWLOCK_INITIALIZER("unplock"); struct rwlock unp_df_lock = RWLOCK_INITIALIZER("unpdflk"); struct rwlock unp_gc_lock = RWLOCK_INITIALIZER("unpgclk"); struct mutex unp_rights_mtx = MUTEX_INITIALIZER(IPL_SOFTNET); +struct mutex unp_ino_mtx = MUTEX_INITIALIZER(IPL_SOFTNET); /* * Stack of sets of files that were passed over a socket but were @@ -88,6 +88,7 @@ void unp_discard(struct fdpass *, int); void unp_mark(struct fdpass *, int); void unp_scan(struct mbuf *, void (*)(struct fdpass *, int)); int unp_nam2sun(struct mbuf *, struct sockaddr_un **, size_t *); +int unp_solock_peer(struct socket *, struct socket **); struct pool unpcb_pool; struct task unp_gc_task = TASK_INITIALIZER(unp_gc, NULL); @@ -121,6 +122,54 @@ unp_init(void) IPL_SOFTNET, 0, "unpcb", NULL); } +int +unp_solock_peer(struct socket *so, struct socket **ret) +{ + struct unpcb *unp; + struct socket *so2; + + unp = so->so_pcb; + + if (unp->unp_conn == NULL) + return ENOTCONN; + + so2 = unp->unp_conn->unp_socket; + + if (so < so2) + solock(so2); + else if (so > so2){ + struct unpcb *unp2; + + soref(so2); + sounlock(so, SL_LOCKED); + + solock(so2); + solock(so); + + unp2 = so2->so_pcb; + + /* + * The peer could be closed due to re-lock. In the case + * `so' is the datagram socket it could be connected + * again by concurrent PRU_CONNECT thread to another + * datagram socket. This socket could be newly created + * and have `so_pcb' with the address our previous peer + * had. 
+ */ + if (unp->unp_conn != unp2 || unp2 == NULL) { + sounlock(so2, SL_LOCKED); + sorele(so2); + return EPIPE; + } + + sorele(so2); + } + + *ret = so2; + + return 0; +} + void uipc_setaddr(const struct unpcb *unp, struct mbuf *nam) { @@ -195,6 +244,12 @@ uipc_usrreq(struct socket *so, int req, * if it was bound and we are still connected * (our peer may have closed already!). */ + /* + * Don't need to lock `unp_conn'. The `unp_addr' is + * immutable since we set it within unp_connect(). + * Both sockets are locked while we connecting them + * so it's enough to hold lock on `unp'. + */ uipc_setaddr(unp->unp_conn, nam); break; @@ -212,9 +267,8 @@ uipc_usrreq(struct socket *so, int req, case SOCK_STREAM: case SOCK_SEQPACKET: - if (unp->unp_conn == NULL) + if ((unp_solock_peer(so, &so2)) != 0) break; - so2 = unp->unp_conn->unp_socket; /* * Adjust backpressure on sender * and wakeup any waiting to write. @@ -222,6 +276,7 @@ uipc_usrreq(struct socket *so, int req, so2->so_snd.sb_mbcnt = so->so_rcv.sb_mbcnt; so2->so_snd.sb_cc = so->so_rcv.sb_cc; sowwakeup(so2); + sounlock(so2, SL_LOCKED); break; default: @@ -250,13 +305,14 @@ uipc_usrreq(struct socket *so, int req, error = unp_connect(so, nam, p); if (error) break; - } else { - if (unp->unp_conn == NULL) { - error = ENOTCONN; - break; - } } - so2 = unp->unp_conn->unp_socket; + + if ((error = unp_solock_peer(so, &so2)) != 0) { + if (nam != NULL) + error = ECONNREFUSED; + break; + } + if (unp->unp_addr) from = mtod(unp->unp_addr, struct sockaddr *); else @@ -267,6 +323,10 @@ uipc_usrreq(struct socket *so, int req, control = NULL; } else error = ENOBUFS; + + if (so2 != so) + sounlock(so2, SL_LOCKED); + if (nam) unp_disconnect(unp); break; @@ -278,11 +338,10 @@ uipc_usrreq(struct socket *so, int req, error = EPIPE; break; } - if (unp->unp_conn == NULL) { - error = ENOTCONN; + if ((error = unp_solock_peer(so, &so2)) != 0) { break; } - so2 = unp->unp_conn->unp_socket; + /* * Send to paired receive port, and then raise * send 
buffer counts to maintain backpressure. @@ -304,6 +363,8 @@ uipc_usrreq(struct socket *so, int req, so->so_snd.sb_cc = so2->so_rcv.sb_cc; if (so2->so_rcv.sb_cc > 0) sorwakeup(so2); + + sounlock(so2, SL_LOCKED); m = NULL; break; @@ -317,12 +378,7 @@ uipc_usrreq(struct socket *so, int req, case PRU_ABORT: unp_detach(unp); - /* - * As long as `unp_lock' is taken before entering - * uipc_usrreq() releasing it here would lead to a - * double unlock. - */ - sofree(so, SL_NOUNLOCK); + sofree(so, SL_LOCKED); break; case PRU_SENSE: { @@ -330,8 +386,10 @@ uipc_usrreq(struct socket *so, int req, sb->st_blksize = so->so_snd.sb_hiwat; sb->st_dev = NODEV; + mtx_enter(&unp_ino_mtx); if (unp->unp_ino == 0) unp->unp_ino = unp_ino++; + mtx_leave(&unp_ino_mtx); sb->st_atim.tv_sec = sb->st_mtim.tv_sec = sb->st_ctim.tv_sec = unp->unp_ctime.tv_sec; @@ -352,6 +410,12 @@ uipc_usrreq(struct socket *so, int req, break; case PRU_PEERADDR: + /* + * Don't need to lock `unp_conn'. The `unp_addr' is + * immutable since we set it within unp_connect(). + * Both sockets are locked while we connecting them + * so it's enough to hold lock on `unp'. + */ uipc_setaddr(unp->unp_conn, nam); break; @@ -404,8 +468,6 @@ uipc_attach(struct socket *so, int proto struct unpcb *unp; int error; - rw_assert_wrlock(&unp_lock); - if (so->so_pcb) return EISCONN; if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { @@ -439,12 +501,6 @@ uipc_attach(struct socket *so, int proto /* * Enforce `unp_gc_lock' -> `solock()' lock order. */ - /* - * We also release the lock on listening socket and on our peer - * socket when called from unp_connect(). This is safe. The - * listening socket protected by vnode(9) lock. The peer socket - * has 'UNP_CONNECTING' flag set. 
- */ sounlock(so, SL_LOCKED); rw_enter_write(&unp_gc_lock); LIST_INSERT_HEAD(&unp_head, unp, unp_link); @@ -506,14 +562,13 @@ unp_detach(struct unpcb *unp) { struct socket *so = unp->unp_socket; struct vnode *vp = unp->unp_vnode; - - rw_assert_wrlock(&unp_lock); + struct unpcb *unp2; unp->unp_vnode = NULL; /* * Enforce `unp_gc_lock' -> `solock()' lock order. - * Enforce `i_lock' -> `unp_lock' lock order. + * Enforce `i_lock' -> `solock()' lock order. */ sounlock(so, SL_LOCKED); @@ -532,10 +587,40 @@ unp_detach(struct unpcb *unp) solock(so); - if (unp->unp_conn) + if (unp->unp_conn != NULL) { + /* + * Datagram socket could be connected to itself. + * Such socket will be disconnected here. + */ unp_disconnect(unp); - while (!SLIST_EMPTY(&unp->unp_refs)) - unp_drop(SLIST_FIRST(&unp->unp_refs), ECONNRESET); + } + + while ((unp2 = SLIST_FIRST(&unp->unp_refs)) != NULL) { + struct socket *so2 = unp2->unp_socket; + + soref(so2); + sounlock(so, SL_LOCKED); + solock(so2); + + /* + * soref() doesn't protect `so_pcb'. `unp2' + * derefernce is not safe after sounlock(). + */ + unp2 = so2->so_pcb; + + /* + * The datagram socket could be closed. Also it + * could be disconnected from us and connected + * again to another socket. 
+ */ + if (unp2 != NULL && unp2->unp_conn == unp) + unp_drop(unp2, ECONNRESET); + + sounlock(so2, SL_LOCKED); + sorele(so2); + solock(so); + } + soisdisconnected(so); so->so_pcb = NULL; m_freem(unp->unp_addr); @@ -675,24 +760,42 @@ unp_connect(struct socket *so, struct mb } if ((error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p)) != 0) goto put; - solock(so); so2 = vp->v_socket; if (so2 == NULL) { error = ECONNREFUSED; - goto put_locked; + goto put; } if (so->so_type != so2->so_type) { error = EPROTOTYPE; - goto put_locked; + goto put; } + if (so->so_proto->pr_flags & PR_CONNREQUIRED) { + solock(so2); + if ((so2->so_options & SO_ACCEPTCONN) == 0 || (so3 = sonewconn(so2, 0)) == NULL) { error = ECONNREFUSED; - goto put_locked; } + + sounlock(so2, SL_LOCKED); + + if (error != 0) + goto put; + + /* + * Since `so2' is protected by vnode(9) lock, `so3' + * can't be PRU_ABORT'ed here. + */ + solock_pair(so, so3); + unp2 = sotounpcb(so2); unp3 = sotounpcb(so3); + + /* + * `unp_addr', `unp_connid' and 'UNP_FEIDSBIND' flag + * are immutable since we set them in unp_bind(). 
+ */ if (unp2->unp_addr) unp3->unp_addr = m_copym(unp2->unp_addr, 0, M_COPYALL, M_NOWAIT); @@ -700,15 +803,29 @@ unp_connect(struct socket *so, struct mb unp3->unp_connid.gid = p->p_ucred->cr_gid; unp3->unp_connid.pid = p->p_p->ps_pid; unp3->unp_flags |= UNP_FEIDS; - so2 = so3; + if (unp2->unp_flags & UNP_FEIDSBIND) { unp->unp_connid = unp2->unp_connid; unp->unp_flags |= UNP_FEIDS; } + + so2 = so3; + } else { + if (so2 != so) + solock_pair(so, so2); + else + solock(so); } + error = unp_connect2(so, so2); -put_locked: + sounlock(so, SL_LOCKED); + + /* + * `so2' can't be PRU_ABORT'ed concurrently + */ + if (so2 != so) + sounlock(so2, SL_LOCKED); put: vput(vp); unlock: @@ -732,7 +849,8 @@ unp_connect2(struct socket *so, struct s struct unpcb *unp = sotounpcb(so); struct unpcb *unp2; - rw_assert_wrlock(&unp_lock); + soassertlocked(so); + soassertlocked(so2); if (so2->so_type != so->so_type) return (EPROTOTYPE); @@ -747,6 +865,13 @@ unp_connect2(struct socket *so, struct s case SOCK_STREAM: case SOCK_SEQPACKET: + /* + * soisconnected() could re-lock `so2' and will lock + * 'so2->so_head'. `so2' is not yet exposed to userland + * and we can't have concurrent thread which locks `so2' + * and then `so'. Also we can't have concurrent thread + * which simultaneously locks `so' and 'so2->so_head'. 
+ */ unp2->unp_conn = unp; soisconnected(so); soisconnected(so2); @@ -761,11 +886,15 @@ unp_connect2(struct socket *so, struct s void unp_disconnect(struct unpcb *unp) { - struct unpcb *unp2 = unp->unp_conn; + struct socket *so2; + struct unpcb *unp2; - if (unp2 == NULL) + if ((unp_solock_peer(unp->unp_socket, &so2)) != 0) return; + + unp2 = unp->unp_conn; unp->unp_conn = NULL; + switch (unp->unp_socket->so_type) { case SOCK_DGRAM: @@ -784,18 +913,23 @@ unp_disconnect(struct unpcb *unp) soisdisconnected(unp2->unp_socket); break; } + + if (so2 != unp->unp_socket) + sounlock(so2, SL_LOCKED); } void unp_shutdown(struct unpcb *unp) { - struct socket *so; + struct socket *so2; switch (unp->unp_socket->so_type) { case SOCK_STREAM: case SOCK_SEQPACKET: - if (unp->unp_conn && (so = unp->unp_conn->unp_socket)) - socantrcvmore(so); + if ((unp_solock_peer(unp->unp_socket, &so2)) == 0) { + socantrcvmore(so2); + sounlock(so2, SL_LOCKED); + } break; default: break; @@ -807,7 +941,7 @@ unp_drop(struct unpcb *unp, int errno) { struct socket *so = unp->unp_socket; - rw_assert_wrlock(&unp_lock); + soassertlocked(so); so->so_error = errno; unp_disconnect(unp); Index: sys/miscfs/fifofs/fifo_vnops.c =================================================================== RCS file: /cvs/src/sys/miscfs/fifofs/fifo_vnops.c,v retrieving revision 1.85 diff -u -p -r1.85 fifo_vnops.c --- sys/miscfs/fifofs/fifo_vnops.c 24 Oct 2021 11:23:22 -0000 1.85 +++ sys/miscfs/fifofs/fifo_vnops.c 18 Nov 2021 10:49:43 -0000 @@ -156,7 +156,7 @@ fifo_open(void *v) struct vnode *vp = ap->a_vp; struct fifoinfo *fip; struct socket *rso, *wso; - int s, error; + int error; if ((fip = vp->v_fifoinfo) == NULL) { fip = malloc(sizeof(*fip), M_VNODE, M_WAITOK); @@ -182,18 +182,20 @@ fifo_open(void *v) return (error); } fip->fi_readers = fip->fi_writers = 0; - s = solock(wso); + solock(wso); wso->so_state |= SS_CANTSENDMORE; wso->so_snd.sb_lowat = PIPE_BUF; + sounlock(wso, SL_LOCKED); } else { rso = fip->fi_readsock; wso = 
fip->fi_writesock; - s = solock(wso); } if (ap->a_mode & FREAD) { fip->fi_readers++; if (fip->fi_readers == 1) { + solock(wso); wso->so_state &= ~SS_CANTSENDMORE; + sounlock(wso, SL_LOCKED); if (fip->fi_writers > 0) wakeup(&fip->fi_writers); } @@ -202,16 +204,16 @@ fifo_open(void *v) fip->fi_writers++; if ((ap->a_mode & O_NONBLOCK) && fip->fi_readers == 0) { error = ENXIO; - sounlock(wso, s); goto bad; } if (fip->fi_writers == 1) { + solock(rso); rso->so_state &= ~(SS_CANTRCVMORE|SS_ISDISCONNECTED); + sounlock(rso, SL_LOCKED); if (fip->fi_readers > 0) wakeup(&fip->fi_readers); } } - sounlock(wso, s); if ((ap->a_mode & O_NONBLOCK) == 0) { if ((ap->a_mode & FREAD) && fip->fi_writers == 0) { VOP_UNLOCK(vp); @@ -327,17 +329,16 @@ fifo_poll(void *v) struct socket *wso = ap->a_vp->v_fifoinfo->fi_writesock; int events = 0; int revents = 0; - int s; /* * FIFOs don't support out-of-band or high priority data. */ - s = solock(rso); if (ap->a_fflag & FREAD) events |= ap->a_events & (POLLIN | POLLRDNORM); if (ap->a_fflag & FWRITE) events |= ap->a_events & (POLLOUT | POLLWRNORM); + solock_pair(rso, wso); if (events & (POLLIN | POLLRDNORM)) { if (soreadable(rso)) revents |= events & (POLLIN | POLLRDNORM); @@ -362,7 +363,8 @@ fifo_poll(void *v) wso->so_snd.sb_flags |= SB_SEL; } } - sounlock(rso, s); + sounlock(rso, SL_LOCKED); + sounlock(wso, SL_LOCKED); return (revents); } Index: sys/sys/socketvar.h =================================================================== RCS file: /cvs/src/sys/sys/socketvar.h,v retrieving revision 1.101 diff -u -p -r1.101 socketvar.h --- sys/sys/socketvar.h 6 Nov 2021 05:26:33 -0000 1.101 +++ sys/sys/socketvar.h 18 Nov 2021 10:49:43 -0000 @@ -38,6 +38,7 @@ #include <sys/task.h> #include <sys/timeout.h> #include <sys/rwlock.h> +#include <sys/refcnt.h> #ifndef _SOCKLEN_T_DEFINED_ #define _SOCKLEN_T_DEFINED_ @@ -55,6 +56,7 @@ TAILQ_HEAD(soqhead, socket); struct socket { const struct protosw *so_proto; /* protocol handle */ struct rwlock so_lock; /* this 
socket lock */ + struct refcnt so_refcnt; /* references to this socket */ void *so_pcb; /* protocol control block */ u_int so_state; /* internal state flags SS_*, below */ short so_type; /* generic type, see socket.h */ @@ -80,6 +82,7 @@ struct socket { short so_q0len; /* partials on so_q0 */ short so_qlen; /* number of connections on so_q */ short so_qlimit; /* max number queued connections */ + u_long so_newconn; /* # of pending sonewconn() threads */ short so_timeo; /* connection timeout */ u_long so_oobmark; /* chars to oob mark */ u_int so_error; /* error affecting connection */ @@ -150,6 +153,7 @@ struct socket { #define SS_CONNECTOUT 0x1000 /* connect, not accept, at this end */ #define SS_ISSENDING 0x2000 /* hint for lower layer */ #define SS_DNS 0x4000 /* created using SOCK_DNS socket(2) */ +#define SS_NEWCONN_WAIT 0x8000 /* waiting sonewconn() relock */ #ifdef _KERNEL @@ -163,6 +167,18 @@ struct socket { void soassertlocked(struct socket *); +static inline void +soref(struct socket *so) +{ + refcnt_take(&so->so_refcnt); +} + +static inline void +sorele(struct socket *so) +{ + refcnt_rele_wake(&so->so_refcnt); +} + /* * Macros for sockets and socket buffering. 
*/ @@ -337,6 +353,8 @@ int sockargs(struct mbuf **, const void int sosleep_nsec(struct socket *, void *, int, const char *, uint64_t); int solock(struct socket *); +int solock_persocket(struct socket *); +void solock_pair(struct socket *, struct socket *); void sounlock(struct socket *, int); int sendit(struct proc *, int, struct msghdr *, int, register_t *); Index: sys/sys/unpcb.h =================================================================== RCS file: /cvs/src/sys/sys/unpcb.h,v retrieving revision 1.20 diff -u -p -r1.20 unpcb.h --- sys/sys/unpcb.h 16 Nov 2021 08:56:20 -0000 1.20 +++ sys/sys/unpcb.h 18 Nov 2021 10:49:43 -0000 @@ -60,24 +60,25 @@ * Locks used to protect struct members: * I immutable after creation * G unp_gc_lock - * U unp_lock * a atomic + * s socket lock */ struct unpcb { struct socket *unp_socket; /* [I] pointer back to socket */ - struct vnode *unp_vnode; /* [U] if associated with file */ + struct vnode *unp_vnode; /* [s] if associated with file */ struct file *unp_file; /* [a] backpointer for unp_gc() */ - struct unpcb *unp_conn; /* [U] control block of connected socket */ - ino_t unp_ino; /* [U] fake inode number */ - SLIST_HEAD(,unpcb) unp_refs; /* [U] referencing socket linked list */ - SLIST_ENTRY(unpcb) unp_nextref; /* [U] link in unp_refs list */ - struct mbuf *unp_addr; /* [U] bound address of socket */ + struct unpcb *unp_conn; /* [s] control block of connected + socket */ + ino_t unp_ino; /* [s] fake inode number */ + SLIST_HEAD(,unpcb) unp_refs; /* [s] referencing socket linked list */ + SLIST_ENTRY(unpcb) unp_nextref; /* [s] link in unp_refs list */ + struct mbuf *unp_addr; /* [s] bound address of socket */ long unp_msgcount; /* [a] references from socket rcv buf */ - int unp_flags; /* [U] this unpcb contains peer eids */ - int unp_gcflags; /* [G] garbge collector flags */ - struct sockpeercred unp_connid;/* [U] id of peer process */ + int unp_flags; /* [s] this unpcb contains peer eids */ + int unp_gcflags; /* [G] garbage 
collector flags */ + struct sockpeercred unp_connid;/* [s] id of peer process */ struct timespec unp_ctime; /* [I] holds creation time */ LIST_ENTRY(unpcb) unp_link; /* [G] link in per-AF list of sockets */ };