On Thu, Feb 13, 2025 at 08:42:02PM +0100, Claudio Jeker wrote:
> On Thu, Feb 13, 2025 at 08:13:27PM +0300, Vitaliy Makkoveev wrote:
> > > On 13 Feb 2025, at 19:33, Claudio Jeker <cje...@diehard.n-r-g.com> wrote:
> > > 
> > > On Wed, Feb 12, 2025 at 06:47:36PM +0300, Vitaliy Makkoveev wrote:
> > >> Hi,
> > >> 
> > >> This diff serializes tun(4)/tap(4) interfaces creation and destruction
> > >> from ifconfig(8) and device nodes path. Not only simplifies, but fixes
> > >> syzkaller [1] report.
> > >> 
> > >> Can I ask openvpn or vmd(8) users to test it and prove that there is no
> > >> behavior changes?
> > > 
> > > Just to be clear the diff changes the way tun(4) and tap(4) interfaces are
> > > created and destroyed. So it matters on how the interface is created and
> > > if it does behave properly.
> > 
> > The original creation and destruction paths of tun(4) are full of
> > sleep points, so those paths have dances to protect half-created
> > or half-destroyed interface. This diff uses the dedicated rwlock
> > for cloners in the device nodes handlers to make races impossible.
> > 
> > > I think the things to test are:
> > > - If you open /dev/tun1 is tun1 created and if you close /dev/tun1 is
> > > it destroyed again.
> > 
> > It works.
> > 
> > > - If tun1 alreay exists on open of /dev/tun1 then the last close of
> > > /dev/tun1 should not destroy tun1.
> > 
> > This is not the current behaviour. The TUN_STAYUP exists to prevent
> > the concurrent destruction from tun_dev_close() while we are in the
> > middle of tun_dev_open(). This could happen from the VOP_REWOKE()
> > path of tun_clone_destroy(). In this case the interface will be
> > destroyed, but not by tun_dev_close().
> 
> It is very much the current behaviour. I use this every day. I have
> preconfigured tap0 and tap1 interfaces that are always around.
> So this is very much current behaviour.
>  
> > > - If you have /dev/tun1 open and destroy tun1 (the interface) are you
> > > notified that /dev/tun1 is no longer available.
> > > 
> > 
> > Only tun_dev_read() should be notified, and yes, it works.
> > 
> > > I think there are other cases to consider, those are just the obvious ones
> > > that I remember. I glanced at the diff, it changes a hell of a lot and the
> > > bit that scares me is that TUN_STAYUP becomes unused and looking at
> > > tun_dev_close() I think this is indeed not right. In other words I think
> > > the 2nd bit from the list above is violated.
> > 
> > As I previously said, the TUN_STAYUP bit required only to prevent
> > concurrent destruction in the middle of tun_dev_open(). Now
> > concurrent destruction is impossible.
> 
> I don't think your right at least I always understood that TUN_STAYUP was
> added exactly to prevent the destruction of an interface that was created
> through interface cloning.
> 

I see. Current tun creation and destruction is complicated, because we
need to deal with concurrent threads. I looked at TUN_STAYUP flag from
this point and found it useless, because in the race we can't predict 
the winner. So I decided to always destroy interface at close. This
updated diff restores existing behavior with TUN_STAYUP flag.

The diff doesn't change a lot. It only makes `if_cloners_lock' external
to if_clone_{create,destroy}() so now we could always say who is the tun
instance creator. This makes things much simple.

I could commit ifq hunks first. This should decrease the diff size.

Index: sys/net/if.c
===================================================================
RCS file: /cvs/src/sys/net/if.c,v
retrieving revision 1.726
diff -u -p -r1.726 if.c
--- sys/net/if.c        3 Feb 2025 08:58:52 -0000       1.726
+++ sys/net/if.c        13 Feb 2025 22:33:28 -0000
@@ -1310,45 +1310,55 @@ if_isconnected(const struct ifnet *ifp0,
  * Create a clone network interface.
  */
 int
-if_clone_create(const char *name, int rdomain)
+if_clone_create_locked(const char *name, int rdomain)
 {
        struct if_clone *ifc;
        struct ifnet *ifp;
        int unit, ret;
 
+       rw_assert_wrlock(&if_cloners_lock);
+
        ifc = if_clone_lookup(name, &unit);
        if (ifc == NULL)
                return (EINVAL);
-
-       rw_enter_write(&if_cloners_lock);
-
        if ((ifp = if_unit(name)) != NULL) {
                ret = EEXIST;
-               goto unlock;
+               goto put;
        }
 
        ret = (*ifc->ifc_create)(ifc, unit);
 
        if (ret != 0 || (ifp = if_unit(name)) == NULL)
-               goto unlock;
+               goto put;
 
        NET_LOCK();
        if_addgroup(ifp, ifc->ifc_name);
        if (rdomain != 0)
                if_setrdomain(ifp, rdomain);
        NET_UNLOCK();
-unlock:
-       rw_exit_write(&if_cloners_lock);
+put:
        if_put(ifp);
 
        return (ret);
 }
 
+int
+if_clone_create(const char *name, int rdomain)
+{
+       int error;
+
+       rw_enter_write(&if_cloners_lock);
+       error = if_clone_create_locked(name, rdomain);
+       rw_exit_write(&if_cloners_lock);
+
+       return (error);
+}
+
 /*
  * Destroy a clone network interface.
  */
 int
-if_clone_destroy(const char *name)
+if_clone_destroy_locked(const char *name)
 {
        struct if_clone *ifc;
        struct ifnet *ifp;
@@ -1361,16 +1371,14 @@ if_clone_destroy(const char *name)
        if (ifc->ifc_destroy == NULL)
                return (EOPNOTSUPP);
 
-       rw_enter_write(&if_cloners_lock);
+       rw_assert_wrlock(&if_cloners_lock);
 
        TAILQ_FOREACH(ifp, &ifnetlist, if_list) {
                if (strcmp(ifp->if_xname, name) == 0)
                        break;
        }
-       if (ifp == NULL) {
-               rw_exit_write(&if_cloners_lock);
+       if (ifp == NULL)
                return (ENXIO);
-       }
 
        NET_LOCK();
        if (ifp->if_flags & IFF_UP) {
@@ -1382,9 +1390,19 @@ if_clone_destroy(const char *name)
        NET_UNLOCK();
        ret = (*ifc->ifc_destroy)(ifp);
 
+       return (ret);
+}
+
+int
+if_clone_destroy(const char *name)
+{
+       int error;
+
+       rw_enter_write(&if_cloners_lock);
+       error = if_clone_destroy_locked(name);
        rw_exit_write(&if_cloners_lock);
 
-       return (ret);
+       return (error);
 }
 
 /*
Index: sys/net/if_tun.c
===================================================================
RCS file: /cvs/src/sys/net/if_tun.c,v
retrieving revision 1.250
diff -u -p -r1.250 if_tun.c
--- sys/net/if_tun.c    30 Dec 2024 02:46:00 -0000      1.250
+++ sys/net/if_tun.c    13 Feb 2025 22:33:28 -0000
@@ -115,7 +115,7 @@ int tundebug = TUN_DEBUG;
 void   tunattach(int);
 
 int    tun_dev_open(dev_t, const struct if_clone *, int, struct proc *);
-int    tun_dev_close(dev_t, struct proc *);
+int    tun_dev_close(dev_t, const struct if_clone *, struct proc *);
 int    tun_dev_ioctl(dev_t, u_long, void *);
 int    tun_dev_read(dev_t, struct uio *, int);
 int    tun_dev_write(dev_t, struct uio *, int, int);
@@ -284,11 +284,7 @@ tun_create(struct if_clone *ifc, int uni
        }
 
        sigio_init(&sc->sc_sigio);
-
-       /* tell tun_dev_open we're initialised */
-
-       sc->sc_flags |= TUN_INITED|TUN_STAYUP;
-       wakeup(sc);
+       sc->sc_flags |= TUN_STAYUP;
 
        return (0);
 
@@ -303,25 +299,10 @@ int
 tun_clone_destroy(struct ifnet *ifp)
 {
        struct tun_softc        *sc = ifp->if_softc;
-       dev_t                    dev;
 
        KERNEL_ASSERT_LOCKED();
 
-       if (ISSET(sc->sc_flags, TUN_DEAD))
-               return (ENXIO);
        SET(sc->sc_flags, TUN_DEAD);
-
-       /* kick userland off the device */
-       dev = sc->sc_dev;
-       if (dev) {
-               struct vnode *vp;
-
-               if (vfinddev(dev, VCHR, &vp))
-                       VOP_REVOKE(vp, REVOKEALL);
-
-               KASSERT(sc->sc_dev == 0);
-       }
-
        /* prevent userland from getting to the device again */
        SMR_LIST_REMOVE_LOCKED(sc, sc_entry);
        smr_barrier();
@@ -390,70 +371,29 @@ tun_dev_open(dev_t dev, const struct if_
        struct tun_softc *sc;
        struct ifnet *ifp;
        int error;
-       u_short stayup = 0;
-       struct vnode *vp;
-
        char name[IFNAMSIZ];
-       unsigned int rdomain;
-
-       /*
-        * Find the vnode associated with this open before we sleep
-        * and let something else revoke it. Our caller has a reference
-        * to it so we don't need to account for it.
-        */
-       if (!vfinddev(dev, VCHR, &vp))
-               panic("%s vfinddev failed", __func__);
 
        snprintf(name, sizeof(name), "%s%u", ifc->ifc_name, minor(dev));
-       rdomain = rtable_l2(p->p_p->ps_rtableid);
-
-       /* let's find or make an interface to work with */
-       while ((sc = tun_name_lookup(name)) == NULL) {
-               error = if_clone_create(name, rdomain);
-               switch (error) {
-               case 0: /* it's probably ours */
-                       stayup = TUN_STAYUP;
-                       /* FALLTHROUGH */
-               case EEXIST: /* we may have lost a race with someone else */
-                       break;
-               default:
-                       return (error);
-               }
-       }
-
-       refcnt_take(&sc->sc_refs);
 
-       /* wait for it to be fully constructed before we use it */
-       for (;;) {
-               if (ISSET(sc->sc_flags, TUN_DEAD)) {
-                       error = ENXIO;
-                       goto done;
-               }
+       rw_enter_write(&if_cloners_lock);
 
-               if (ISSET(sc->sc_flags, TUN_INITED))
-                       break;
+       if ((sc = tun_name_lookup(name)) == NULL) {
+               unsigned int rdomain = rtable_l2(p->p_p->ps_rtableid);
 
-               error = tsleep_nsec(sc, PCATCH, "tuninit", INFSLP);
-               if (error != 0) {
-                       /* XXX if_clone_destroy if stayup? */
-                       goto done;
-               }
-       }
+               if ((error = if_clone_create_locked(name, rdomain)) != 0)
+                       goto out;
 
-       /* Has tun_clone_destroy torn the rug out under us? */
-       if (vp->v_type == VBAD) {
-               error = ENXIO;
-               goto done;
+               sc = tun_name_lookup(name);
+               CLR(sc->sc_flags, TUN_STAYUP);
        }
 
        if (sc->sc_dev != 0) {
-               /* aww, we lost */
+               /* already opened */
                error = EBUSY;
-               goto done;
+               goto out;
        }
        /* it's ours now */
        sc->sc_dev = dev;
-       CLR(sc->sc_flags, stayup);
 
        /* automatically mark the interface running on open */
        ifp = &sc->sc_if;
@@ -461,72 +401,61 @@ tun_dev_open(dev_t dev, const struct if_
        SET(ifp->if_flags, IFF_UP | IFF_RUNNING);
        NET_UNLOCK();
        tun_link_state(ifp, LINK_STATE_FULL_DUPLEX);
-       error = 0;
+out:
+       rw_exit_write(&if_cloners_lock);
 
-done:
-       tun_put(sc);
        return (error);
 }
 
-/*
- * tunclose - close the device; if closing the real device, flush pending
- *  output and unless STAYUP bring down and destroy the interface.
- */
 int
 tunclose(dev_t dev, int flag, int mode, struct proc *p)
 {
-       return (tun_dev_close(dev, p));
+       return (tun_dev_close(dev, &tun_cloner, p));
 }
 
 int
 tapclose(dev_t dev, int flag, int mode, struct proc *p)
 {
-       return (tun_dev_close(dev, p));
+       return (tun_dev_close(dev, &tap_cloner, p));
 }
 
 int
-tun_dev_close(dev_t dev, struct proc *p)
+tun_dev_close(dev_t dev, const struct if_clone *ifc, struct proc *p)
 {
        struct tun_softc        *sc;
        struct ifnet            *ifp;
        int                      error = 0;
        char                     name[IFNAMSIZ];
-       int                      destroy = 0;
 
-       sc = tun_get(dev);
-       if (sc == NULL)
-               return (ENXIO);
+       snprintf(name, sizeof(name), "%s%u", ifc->ifc_name, minor(dev));
 
-       ifp = &sc->sc_if;
+       rw_enter_write(&if_cloners_lock);
 
-       /*
-        * junk all pending output
-        */
-       NET_LOCK();
-       CLR(ifp->if_flags, IFF_UP | IFF_RUNNING);
-       CLR(ifp->if_capabilities, TUN_IF_CAPS);
-       NET_UNLOCK();
-       ifq_purge(&ifp->if_snd);
+       if ((sc = tun_name_lookup(name)) == NULL) {
+               error = ENXIO;
+               goto out;
+       }
 
-       CLR(sc->sc_flags, TUN_ASYNC|TUN_HDR);
-       sigio_free(&sc->sc_sigio);
+       if (ISSET(sc->sc_flags, TUN_STAYUP)) {
+               ifp = &sc->sc_if;
 
-       if (!ISSET(sc->sc_flags, TUN_DEAD)) {
-               /* we can't hold a reference to sc before we start a dtor */
-               if (!ISSET(sc->sc_flags, TUN_STAYUP)) {
-                       destroy = 1;
-                       strlcpy(name, ifp->if_xname, sizeof(name));
-               } else {
-                       tun_link_state(ifp, LINK_STATE_DOWN);
-               }
-       }
+               /* junk all pending output */
+               NET_LOCK();
+               CLR(ifp->if_flags, IFF_UP | IFF_RUNNING);
+               CLR(ifp->if_capabilities, TUN_IF_CAPS);
+               NET_UNLOCK();
+               ifq_purge(&ifp->if_snd);
 
-       sc->sc_dev = 0;
+               CLR(sc->sc_flags, TUN_ASYNC|TUN_HDR);
+               sigio_free(&sc->sc_sigio);
 
-       tun_put(sc);
+               tun_link_state(ifp, LINK_STATE_DOWN);
+               sc->sc_dev = 0;
+       } else
+               if_clone_destroy_locked(name);
 
-       if (destroy)
-               if_clone_destroy(name);
+out:
+       rw_exit_write(&if_cloners_lock);
 
        return (error);
 }
@@ -831,10 +760,24 @@ tun_dev_read(dev_t dev, struct uio *uio,
 
        ifp = &sc->sc_if;
 
-       error = ifq_deq_sleep(&ifp->if_snd, &m0, ISSET(ioflag, IO_NDELAY),
-           (PZERO + 1)|PCATCH, "tunread", &sc->sc_reading, &sc->sc_dev);
-       if (error != 0)
-               goto put;
+       while ((m0 = ifq_dequeue(&ifp->if_snd)) == NULL) {
+               if (ISSET(ioflag, IO_NDELAY)) {
+                       error = EWOULDBLOCK;
+                       goto put;
+               }
+
+               sc->sc_reading = 1;
+               error = tsleep_nsec(&ifp->if_snd, (PZERO + 1) | PCATCH,
+                   "tunread", INFSLP);
+               sc->sc_reading = 0;
+
+               if (error != 0)
+                       goto put;
+               if (ISSET(sc->sc_flags, TUN_DEAD)) {
+                       error = ENXIO;
+                       goto put;
+               }
+       }
 
 #if NBPFILTER > 0
        if (ifp->if_bpf)
Index: sys/net/if_tun.h
===================================================================
RCS file: /cvs/src/sys/net/if_tun.h,v
retrieving revision 1.18
diff -u -p -r1.18 if_tun.h
--- sys/net/if_tun.h    17 Nov 2024 00:25:07 -0000      1.18
+++ sys/net/if_tun.h    13 Feb 2025 22:33:28 -0000
@@ -75,7 +75,7 @@ struct tun_hdr {
 };
 
 #define        TUN_OPEN        0x0001
-#define        TUN_INITED      0x0002
+#define        TUN_INITED      0x0002  /* unused */
 #define        TUN_RCOLL       0x0004  /* unused */
 #define        TUN_IASET       0x0008
 #define        TUN_DSTADDR     0x0010
Index: sys/net/if_var.h
===================================================================
RCS file: /cvs/src/sys/net/if_var.h,v
retrieving revision 1.135
diff -u -p -r1.135 if_var.h
--- sys/net/if_var.h    24 Jan 2025 09:19:07 -0000      1.135
+++ sys/net/if_var.h    13 Feb 2025 22:33:28 -0000
@@ -325,6 +325,7 @@ int         niq_enlist(struct niqueue *, struct
     sysctl_mq((_n), (_l), (_op), (_olp), (_np), (_nl), &(_niq)->ni_q)
 
 extern struct ifnet_head ifnetlist;
+extern struct rwlock if_cloners_lock;
 
 void   if_start(struct ifnet *);
 int    if_enqueue(struct ifnet *, struct mbuf *);
@@ -355,6 +356,8 @@ int if_isconnected(const struct ifnet *,
 
 void   if_clone_attach(struct if_clone *);
 
+int    if_clone_create_locked(const char *, int);
+int    if_clone_destroy_locked(const char *);
 int    if_clone_create(const char *, int);
 int    if_clone_destroy(const char *);
 
Index: sys/net/ifq.c
===================================================================
RCS file: /cvs/src/sys/net/ifq.c,v
retrieving revision 1.56
diff -u -p -r1.56 ifq.c
--- sys/net/ifq.c       3 Feb 2025 08:58:52 -0000       1.56
+++ sys/net/ifq.c       13 Feb 2025 22:33:28 -0000
@@ -478,45 +478,6 @@ ifq_dequeue(struct ifqueue *ifq)
 }
 
 int
-ifq_deq_sleep(struct ifqueue *ifq, struct mbuf **mp, int nbio, int priority,
-    const char *wmesg, volatile unsigned int *sleeping,
-    volatile unsigned int *alive)
-{
-       struct mbuf *m;
-       void *cookie;
-       int error = 0;
-
-       ifq_deq_enter(ifq);
-       if (ifq->ifq_len == 0 && nbio)
-               error = EWOULDBLOCK;
-       else {
-               for (;;) {
-                       m = ifq->ifq_ops->ifqop_deq_begin(ifq, &cookie);
-                       if (m != NULL) {
-                               ifq->ifq_ops->ifqop_deq_commit(ifq, m, cookie);
-                               ifq->ifq_len--;
-                               *mp = m;
-                               break;
-                       }
-
-                       (*sleeping)++;
-                       error = msleep_nsec(ifq, &ifq->ifq_mtx,
-                           priority, wmesg, INFSLP);
-                       (*sleeping)--;
-                       if (error != 0)
-                               break;
-                       if (!(*alive)) {
-                               error = EIO;
-                               break;
-                       }
-               }
-       }
-       ifq_deq_leave(ifq);
-
-       return (error);
-}
-
-int
 ifq_hdatalen(struct ifqueue *ifq)
 {
        struct mbuf *m;
Index: sys/net/ifq.h
===================================================================
RCS file: /cvs/src/sys/net/ifq.h,v
retrieving revision 1.43
diff -u -p -r1.43 ifq.h
--- sys/net/ifq.h       3 Feb 2025 08:58:52 -0000       1.43
+++ sys/net/ifq.h       13 Feb 2025 22:33:28 -0000
@@ -446,10 +446,6 @@ void                ifq_barrier(struct ifqueue *);
 void            ifq_set_oactive(struct ifqueue *);
 void            ifq_deq_set_oactive(struct ifqueue *);
 
-int             ifq_deq_sleep(struct ifqueue *, struct mbuf **, int, int,
-                    const char *, volatile unsigned int *,
-                    volatile unsigned int *);
-
 #define ifq_len(_ifq)          READ_ONCE((_ifq)->ifq_len)
 #define ifq_empty(_ifq)                (ifq_len(_ifq) == 0)
 

Reply via email to