> Date: Fri, 5 Nov 2021 07:18:03 +0100
> From: Martin Pieuchot <[email protected]>
> 
> On 30/10/21(Sat) 21:22, Vitaliy Makkoveev wrote:
> > This completely removes global rwlock(9) from the unp_internalize() and
> > unp_externalize() normal paths but only leaves it in unp_externalize()
> > error path. Also we don't need to simultaneously hold both fdplock()
> > and `unp_lock' in unp_internalize(). As non obvious profit this
> > simplifies the future lock dances in the UNIX sockets layer.
> > 
> > It's safe to call fptounp() without `unp_lock' held. We always got this
> > file descriptor by fd_getfile(9) so we always have the extra reference
> > and this descriptor can't be closed by concurrent thread. Some sockets
> > could be destroyed through 'PRU_ABORT' path but they don't have
> > associated file descriptor and they are not accessible in the
> > unp_internalize() path.
> > 
> > The `unp_file' access without `unp_lock' held is also safe. Each socket
> > could have the only associated file descriptor and each file descriptor
> > could have the only associated socket. We only assign `unp_file' in the
> > unp_internalize() path where we got the socket by fd_getfile(9). This
> > descriptor has the extra reference and couldn't be closed concurrently.
> > We could override `unp_file' but with the same address because the
> > associated file descriptor can't be changed so the address will be also
> > the same. So while unp_gc() concurrently runs the dereference of
> > non-NULL `unp_file' is always safe.
> 
> Using an atomic operation for `unp_msgcount' is ok with me, one comment
> about `unp_rights' below.
> 
> > Index: sys/kern/uipc_usrreq.c
> > ===================================================================
> > RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
> > retrieving revision 1.153
> > diff -u -p -r1.153 uipc_usrreq.c
> > --- sys/kern/uipc_usrreq.c  30 Oct 2021 16:35:31 -0000      1.153
> > +++ sys/kern/uipc_usrreq.c  30 Oct 2021 18:41:25 -0000
> > @@ -58,6 +58,7 @@
> >   * Locks used to protect global data and struct members:
> >   *      I       immutable after creation
> >   *      U       unp_lock
> > + *      a       atomic
> >   */
> >  struct rwlock unp_lock = RWLOCK_INITIALIZER("unplock");
> >  
> > @@ -99,7 +100,7 @@ SLIST_HEAD(,unp_deferral)        unp_deferred =
> >     SLIST_HEAD_INITIALIZER(unp_deferred);
> >  
> >  ino_t      unp_ino;        /* [U] prototype for fake inode numbers */
> > -int        unp_rights;     /* [U] file descriptors in flight */
> > +int        unp_rights;     /* [a] file descriptors in flight */
> >  int        unp_defer;      /* [U] number of deferred fp to close by the GC 
> > task */
> >  int        unp_gcing;      /* [U] GC task currently running */
> >  
> > @@ -927,17 +928,16 @@ restart:
> >      */
> >     rp = (struct fdpass *)CMSG_DATA(cm);
> >  
> > -   rw_enter_write(&unp_lock);
> >     for (i = 0; i < nfds; i++) {
> >             struct unpcb *unp;
> >  
> >             fp = rp->fp;
> >             rp++;
> >             if ((unp = fptounp(fp)) != NULL)
> > -                   unp->unp_msgcount--;
> > -           unp_rights--;
> > +                   atomic_dec_long(&unp->unp_msgcount);
> >     }
> > -   rw_exit_write(&unp_lock);
> > +
> > +   atomic_sub_int(&unp_rights, nfds);
> >  
> >     /*
> >      * Copy temporary array to message and adjust length, in case of
> > @@ -985,13 +985,10 @@ unp_internalize(struct mbuf *control, st
> >             return (EINVAL);
> >     nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / sizeof (int);
> >  
> > -   rw_enter_write(&unp_lock);
> > -   if (unp_rights + nfds > maxfiles / 10) {
> > -           rw_exit_write(&unp_lock);
> > +   if (atomic_add_int_nv(&unp_rights, nfds) > maxfiles / 10) {
> > +           atomic_sub_int(&unp_rights, nfds);
> 
> I can't believe this is race free. If two threads, T1 and T2, call
> atomic_add at the same time both might end up returning EMFILE even
> if only the first one currently does.  This could happen if T1 exceeds
> the limit and T2 does atomic_add on an already-exceeded `unp_rights'
> before T1 could do atomic_sub.
> 
> I suggest using a mutex to protect `unp_rights' instead to solve this
> issue.

Yes, that would be better.  Otherwise it would be trivial to DOS
anything that does file descriptor passing.

> >             return (EMFILE);
> >     }
> > -   unp_rights += nfds;
> > -   rw_exit_write(&unp_lock);
> >  
> >     /* Make sure we have room for the struct file pointers */
> >  morespace:
> > @@ -1031,7 +1028,6 @@ morespace:
> >     ip = ((int *)CMSG_DATA(cm)) + nfds - 1;
> >     rp = ((struct fdpass *)CMSG_DATA(cm)) + nfds - 1;
> >     fdplock(fdp);
> > -   rw_enter_write(&unp_lock);
> >     for (i = 0; i < nfds; i++) {
> >             memcpy(&fd, ip, sizeof fd);
> >             ip--;
> > @@ -1056,15 +1052,13 @@ morespace:
> >             rp->flags = fdp->fd_ofileflags[fd] & UF_PLEDGED;
> >             rp--;
> >             if ((unp = fptounp(fp)) != NULL) {
> > +                   atomic_inc_long(&unp->unp_msgcount);
> >                     unp->unp_file = fp;
> > -                   unp->unp_msgcount++;
> >             }
> >     }
> > -   rw_exit_write(&unp_lock);
> >     fdpunlock(fdp);
> >     return (0);
> >  fail:
> > -   rw_exit_write(&unp_lock);
> >     fdpunlock(fdp);
> >     if (fp != NULL)
> >             FRELE(fp, p);
> > @@ -1072,17 +1066,13 @@ fail:
> >     for ( ; i > 0; i--) {
> >             rp++;
> >             fp = rp->fp;
> > -           rw_enter_write(&unp_lock);
> >             if ((unp = fptounp(fp)) != NULL)
> > -                   unp->unp_msgcount--;
> > -           rw_exit_write(&unp_lock);
> > +                   atomic_dec_long(&unp->unp_msgcount);
> >             FRELE(fp, p);
> >     }
> >  
> >  nospace:
> > -   rw_enter_write(&unp_lock);
> > -   unp_rights -= nfds;
> > -   rw_exit_write(&unp_lock);
> > +   atomic_sub_int(&unp_rights, nfds);
> >  
> >     return (error);
> >  }
> > @@ -1105,21 +1095,21 @@ unp_gc(void *arg __unused)
> >     /* close any fds on the deferred list */
> >     while ((defer = SLIST_FIRST(&unp_deferred)) != NULL) {
> >             SLIST_REMOVE_HEAD(&unp_deferred, ud_link);
> > +           rw_exit_write(&unp_lock);
> >             for (i = 0; i < defer->ud_n; i++) {
> >                     fp = defer->ud_fp[i].fp;
> >                     if (fp == NULL)
> >                             continue;
> > +                   if ((unp = fptounp(fp)) != NULL)
> > +                           atomic_dec_long(&unp->unp_msgcount);
> > +                   atomic_dec_int(&unp_rights);
> >                      /* closef() expects a refcount of 2 */
> >                     FREF(fp);
> > -                   if ((unp = fptounp(fp)) != NULL)
> > -                           unp->unp_msgcount--;
> > -                   unp_rights--;
> > -                   rw_exit_write(&unp_lock);
> >                     (void) closef(fp, NULL);
> > -                   rw_enter_write(&unp_lock);
> >             }
> >             free(defer, M_TEMP, sizeof(*defer) +
> >                 sizeof(struct fdpass) * defer->ud_n);
> > +           rw_enter_write(&unp_lock);
> >     }
> >  
> >     unp_defer = 0;
> > Index: sys/sys/unpcb.h
> > ===================================================================
> > RCS file: /cvs/src/sys/sys/unpcb.h,v
> > retrieving revision 1.18
> > diff -u -p -r1.18 unpcb.h
> > --- sys/sys/unpcb.h 10 Feb 2021 08:20:09 -0000      1.18
> > +++ sys/sys/unpcb.h 30 Oct 2021 18:41:25 -0000
> > @@ -60,19 +60,20 @@
> >   * Locks used to protect struct members:
> >   *      I       immutable after creation
> >   *      U       unp_lock
> > + *      a       atomic
> >   */
> >  
> >  
> >  struct     unpcb {
> >     struct  socket *unp_socket;     /* [I] pointer back to socket */
> >     struct  vnode *unp_vnode;       /* [U] if associated with file */
> > -   struct  file *unp_file;         /* [U] backpointer for unp_gc() */
> > +   struct  file *unp_file;         /* [a] backpointer for unp_gc() */
> >     struct  unpcb *unp_conn;        /* [U] control block of connected 
> > socket */
> >     ino_t   unp_ino;                /* [U] fake inode number */
> >     SLIST_HEAD(,unpcb) unp_refs;    /* [U] referencing socket linked list */
> >     SLIST_ENTRY(unpcb) unp_nextref; /* [U] link in unp_refs list */
> >     struct  mbuf *unp_addr;         /* [U] bound address of socket */
> > -   long    unp_msgcount;           /* [U] references from socket rcv buf */
> > +   long    unp_msgcount;           /* [a] references from socket rcv buf */
> >     int     unp_flags;              /* [U] this unpcb contains peer eids */
> >     struct  sockpeercred unp_connid;/* [U] id of peer process */
> >     struct  timespec unp_ctime;     /* [I] holds creation time */
> > 
> 
> 

Reply via email to