On 30/10/21(Sat) 21:22, Vitaliy Makkoveev wrote:
> This completely removes global rwlock(9) from the unp_internalize() and
> unp_externalize() normal paths but only leaves it in unp_externalize()
> error path. Also we don't need to simultaneously hold both fdplock()
> and `unp_lock' in unp_internalize(). As non obvious profit this
> simplifies the future lock dances in the UNIX sockets layer.
>
> It's safe to call fptounp() without `unp_lock' held. We always got this
> file descriptor by fd_getfile(9) so we always have the extra reference
> and this descriptor can't be closed by concurrent thread. Some sockets
> could be destroyed through 'PRU_ABORT' path but they don't have
> associated file descriptor and they are not accessible in the
> unp_internalize() path.
>
> The `unp_file' access without `unp_lock' held is also safe. Each socket
> could have the only associated file descriptor and each file descriptor
> could have the only associated socket. We only assign `unp_file' in the
> unp_internalize() path where we got the socket by fd_getfile(9). This
> descriptor has the extra reference and couldn't be closed concurrently.
> We could override `unp_file' but with the same address because the
> associated file descriptor can't be changed so the address will be also
> the same. So while unp_gc() concurrently runs the dereference of
> non-NULL `unp_file' is always safe.
Using an atomic operation for `unp_msgcount' is ok with me, one comment
about `unp_rights' below.
> Index: sys/kern/uipc_usrreq.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
> retrieving revision 1.153
> diff -u -p -r1.153 uipc_usrreq.c
> --- sys/kern/uipc_usrreq.c 30 Oct 2021 16:35:31 -0000 1.153
> +++ sys/kern/uipc_usrreq.c 30 Oct 2021 18:41:25 -0000
> @@ -58,6 +58,7 @@
> * Locks used to protect global data and struct members:
> * I immutable after creation
> * U unp_lock
> + * a atomic
> */
> struct rwlock unp_lock = RWLOCK_INITIALIZER("unplock");
>
> @@ -99,7 +100,7 @@ SLIST_HEAD(,unp_deferral) unp_deferred =
> SLIST_HEAD_INITIALIZER(unp_deferred);
>
> ino_t unp_ino; /* [U] prototype for fake inode numbers */
> -int unp_rights; /* [U] file descriptors in flight */
> +int unp_rights; /* [a] file descriptors in flight */
> int unp_defer; /* [U] number of deferred fp to close by the GC task */
> int unp_gcing; /* [U] GC task currently running */
>
> @@ -927,17 +928,16 @@ restart:
> */
> rp = (struct fdpass *)CMSG_DATA(cm);
>
> - rw_enter_write(&unp_lock);
> for (i = 0; i < nfds; i++) {
> struct unpcb *unp;
>
> fp = rp->fp;
> rp++;
> if ((unp = fptounp(fp)) != NULL)
> - unp->unp_msgcount--;
> - unp_rights--;
> + atomic_dec_long(&unp->unp_msgcount);
> }
> - rw_exit_write(&unp_lock);
> +
> + atomic_sub_int(&unp_rights, nfds);
>
> /*
> * Copy temporary array to message and adjust length, in case of
> @@ -985,13 +985,10 @@ unp_internalize(struct mbuf *control, st
> return (EINVAL);
> nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / sizeof (int);
>
> - rw_enter_write(&unp_lock);
> - if (unp_rights + nfds > maxfiles / 10) {
> - rw_exit_write(&unp_lock);
> + if (atomic_add_int_nv(&unp_rights, nfds) > maxfiles / 10) {
> + atomic_sub_int(&unp_rights, nfds);
I can't believe this is race free. If two threads, T1 and T2, call
atomic_add at the same time both might end up returning EMFILE even
if only the first one currently does. This could happen if T1 exceeds
the limit and T2 does atomic_add on an already-exceeded `unp_rights'
before T1 could do atomic_sub.
I suggest using a mutex to protect `unp_rights' instead to solve this
issue.
> return (EMFILE);
> }
> - unp_rights += nfds;
> - rw_exit_write(&unp_lock);
>
> /* Make sure we have room for the struct file pointers */
> morespace:
> @@ -1031,7 +1028,6 @@ morespace:
> ip = ((int *)CMSG_DATA(cm)) + nfds - 1;
> rp = ((struct fdpass *)CMSG_DATA(cm)) + nfds - 1;
> fdplock(fdp);
> - rw_enter_write(&unp_lock);
> for (i = 0; i < nfds; i++) {
> memcpy(&fd, ip, sizeof fd);
> ip--;
> @@ -1056,15 +1052,13 @@ morespace:
> rp->flags = fdp->fd_ofileflags[fd] & UF_PLEDGED;
> rp--;
> if ((unp = fptounp(fp)) != NULL) {
> + atomic_inc_long(&unp->unp_msgcount);
> unp->unp_file = fp;
> - unp->unp_msgcount++;
> }
> }
> - rw_exit_write(&unp_lock);
> fdpunlock(fdp);
> return (0);
> fail:
> - rw_exit_write(&unp_lock);
> fdpunlock(fdp);
> if (fp != NULL)
> FRELE(fp, p);
> @@ -1072,17 +1066,13 @@ fail:
> for ( ; i > 0; i--) {
> rp++;
> fp = rp->fp;
> - rw_enter_write(&unp_lock);
> if ((unp = fptounp(fp)) != NULL)
> - unp->unp_msgcount--;
> - rw_exit_write(&unp_lock);
> + atomic_dec_long(&unp->unp_msgcount);
> FRELE(fp, p);
> }
>
> nospace:
> - rw_enter_write(&unp_lock);
> - unp_rights -= nfds;
> - rw_exit_write(&unp_lock);
> + atomic_sub_int(&unp_rights, nfds);
>
> return (error);
> }
> @@ -1105,21 +1095,21 @@ unp_gc(void *arg __unused)
> /* close any fds on the deferred list */
> while ((defer = SLIST_FIRST(&unp_deferred)) != NULL) {
> SLIST_REMOVE_HEAD(&unp_deferred, ud_link);
> + rw_exit_write(&unp_lock);
> for (i = 0; i < defer->ud_n; i++) {
> fp = defer->ud_fp[i].fp;
> if (fp == NULL)
> continue;
> + if ((unp = fptounp(fp)) != NULL)
> + atomic_dec_long(&unp->unp_msgcount);
> + atomic_dec_int(&unp_rights);
> /* closef() expects a refcount of 2 */
> FREF(fp);
> - if ((unp = fptounp(fp)) != NULL)
> - unp->unp_msgcount--;
> - unp_rights--;
> - rw_exit_write(&unp_lock);
> (void) closef(fp, NULL);
> - rw_enter_write(&unp_lock);
> }
> free(defer, M_TEMP, sizeof(*defer) +
> sizeof(struct fdpass) * defer->ud_n);
> + rw_enter_write(&unp_lock);
> }
>
> unp_defer = 0;
> Index: sys/sys/unpcb.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/unpcb.h,v
> retrieving revision 1.18
> diff -u -p -r1.18 unpcb.h
> --- sys/sys/unpcb.h 10 Feb 2021 08:20:09 -0000 1.18
> +++ sys/sys/unpcb.h 30 Oct 2021 18:41:25 -0000
> @@ -60,19 +60,20 @@
> * Locks used to protect struct members:
> * I immutable after creation
> * U unp_lock
> + * a atomic
> */
>
>
> struct unpcb {
> struct socket *unp_socket; /* [I] pointer back to socket */
> struct vnode *unp_vnode; /* [U] if associated with file */
> - struct file *unp_file; /* [U] backpointer for unp_gc() */
> + struct file *unp_file; /* [a] backpointer for unp_gc() */
> struct unpcb *unp_conn; /* [U] control block of connected
> socket */
> ino_t unp_ino; /* [U] fake inode number */
> SLIST_HEAD(,unpcb) unp_refs; /* [U] referencing socket linked list */
> SLIST_ENTRY(unpcb) unp_nextref; /* [U] link in unp_refs list */
> struct mbuf *unp_addr; /* [U] bound address of socket */
> - long unp_msgcount; /* [U] references from socket rcv buf */
> + long unp_msgcount; /* [a] references from socket rcv buf */
> int unp_flags; /* [U] this unpcb contains peer eids */
> struct sockpeercred unp_connid;/* [U] id of peer process */
> struct timespec unp_ctime; /* [I] holds creation time */
>