On Thu, Feb 13, 2025 at 08:42:02PM +0100, Claudio Jeker wrote: > On Thu, Feb 13, 2025 at 08:13:27PM +0300, Vitaliy Makkoveev wrote: > > > On 13 Feb 2025, at 19:33, Claudio Jeker <cje...@diehard.n-r-g.com> wrote: > > > > > > On Wed, Feb 12, 2025 at 06:47:36PM +0300, Vitaliy Makkoveev wrote: > > >> Hi, > > >> > > >> This diff serializes tun(4)/tap(4) interfaces creation and destruction > > >> from ifconfig(8) and device nodes path. Not only simplifies, but fixes > > >> syzkaller [1] report. > > >> > > >> Can I ask openvpn or vmd(8) users to test it and prove that there is no > > >> behavior changes? > > > > > > Just to be clear the diff changes the way tun(4) and tap(4) interfaces are > > > created and destroyed. So it matters on how the interface is created and > > > if it does behave properly. > > > > The original creation and destruction paths of tun(4) are full of > > sleep points, so those paths have dances to protect half-created > > or half-destroyed interface. This diff uses the dedicated rwlock > > for cloners in the device nodes handlers to make races impossible. > > > > > I think the things to test are: > > > - If you open /dev/tun1 is tun1 created and if you close /dev/tun1 is > > > it destroyed again. > > > > It works. > > > > > - If tun1 alreay exists on open of /dev/tun1 then the last close of > > > /dev/tun1 should not destroy tun1. > > > > This is not the current behaviour. The TUN_STAYUP exists to prevent > > the concurrent destruction from tun_dev_close() while we are in the > > middle of tun_dev_open(). This could happen from the VOP_REWOKE() > > path of tun_clone_destroy(). In this case the interface will be > > destroyed, but not by tun_dev_close(). > > It is very much the current behaviour. I use this every day. I have > preconfigured tap0 and tap1 interfaces that are always around. > So this is very much current behaviour. > > > > - If you have /dev/tun1 open and destroy tun1 (the interface) are you > > > notified that /dev/tun1 is no longer available. > > > > > > > Only tun_dev_read() should be notified, and yes, it works. > > > > > I think there are other cases to consider, those are just the obvious ones > > > that I remember. I glanced at the diff, it changes a hell of a lot and the > > > bit that scares me is that TUN_STAYUP becomes unused and looking at > > > tun_dev_close() I think this is indeed not right. In other words I think > > > the 2nd bit from the list above is violated. > > > > As I previously said, the TUN_STAYUP bit required only to prevent > > concurrent destruction in the middle of tun_dev_open(). Now > > concurrent destruction is impossible. > > I don't think your right at least I always understood that TUN_STAYUP was > added exactly to prevent the destruction of an interface that was created > through interface cloning. >
I see. Current tun creation and destruction is complicated, because we need to deal with concurrent threads. I looked at TUN_STAYUP flag from this point and found it useless, because in the race we can't predict the winner. So I decided to always destroy interface at close. This updated diff restores existing behavior with TUN_STAYUP flag. The diff doesn't change a lot. It only makes `if_cloners_lock' external to if_clone_{create,destroy}() so now we could always say who is the tun instance creator. This makes things much simple. I could commit ifq hunks first. This should decrease the diff size. Index: sys/net/if.c =================================================================== RCS file: /cvs/src/sys/net/if.c,v retrieving revision 1.726 diff -u -p -r1.726 if.c --- sys/net/if.c 3 Feb 2025 08:58:52 -0000 1.726 +++ sys/net/if.c 13 Feb 2025 22:33:28 -0000 @@ -1310,45 +1310,55 @@ if_isconnected(const struct ifnet *ifp0, * Create a clone network interface. */ int -if_clone_create(const char *name, int rdomain) +if_clone_create_locked(const char *name, int rdomain) { struct if_clone *ifc; struct ifnet *ifp; int unit, ret; + rw_assert_wrlock(&if_cloners_lock); + ifc = if_clone_lookup(name, &unit); if (ifc == NULL) return (EINVAL); - - rw_enter_write(&if_cloners_lock); - if ((ifp = if_unit(name)) != NULL) { ret = EEXIST; - goto unlock; + goto put; } ret = (*ifc->ifc_create)(ifc, unit); if (ret != 0 || (ifp = if_unit(name)) == NULL) - goto unlock; + goto put; NET_LOCK(); if_addgroup(ifp, ifc->ifc_name); if (rdomain != 0) if_setrdomain(ifp, rdomain); NET_UNLOCK(); -unlock: - rw_exit_write(&if_cloners_lock); +put: if_put(ifp); return (ret); } +int +if_clone_create(const char *name, int rdomain) +{ + int error; + + rw_enter_write(&if_cloners_lock); + error = if_clone_create_locked(name, rdomain); + rw_exit_write(&if_cloners_lock); + + return (error); +} + /* * Destroy a clone network interface. */ int -if_clone_destroy(const char *name) +if_clone_destroy_locked(const char *name) { struct if_clone *ifc; struct ifnet *ifp; @@ -1361,16 +1371,14 @@ if_clone_destroy(const char *name) if (ifc->ifc_destroy == NULL) return (EOPNOTSUPP); - rw_enter_write(&if_cloners_lock); + rw_assert_wrlock(&if_cloners_lock); TAILQ_FOREACH(ifp, &ifnetlist, if_list) { if (strcmp(ifp->if_xname, name) == 0) break; } - if (ifp == NULL) { - rw_exit_write(&if_cloners_lock); + if (ifp == NULL) return (ENXIO); - } NET_LOCK(); if (ifp->if_flags & IFF_UP) { @@ -1382,9 +1390,19 @@ if_clone_destroy(const char *name) NET_UNLOCK(); ret = (*ifc->ifc_destroy)(ifp); + return (ret); +} + +int +if_clone_destroy(const char *name) +{ + int error; + + rw_enter_write(&if_cloners_lock); + error = if_clone_destroy_locked(name); rw_exit_write(&if_cloners_lock); - return (ret); + return (error); } /* Index: sys/net/if_tun.c =================================================================== RCS file: /cvs/src/sys/net/if_tun.c,v retrieving revision 1.250 diff -u -p -r1.250 if_tun.c --- sys/net/if_tun.c 30 Dec 2024 02:46:00 -0000 1.250 +++ sys/net/if_tun.c 13 Feb 2025 22:33:28 -0000 @@ -115,7 +115,7 @@ int tundebug = TUN_DEBUG; void tunattach(int); int tun_dev_open(dev_t, const struct if_clone *, int, struct proc *); -int tun_dev_close(dev_t, struct proc *); +int tun_dev_close(dev_t, const struct if_clone *, struct proc *); int tun_dev_ioctl(dev_t, u_long, void *); int tun_dev_read(dev_t, struct uio *, int); int tun_dev_write(dev_t, struct uio *, int, int); @@ -284,11 +284,7 @@ tun_create(struct if_clone *ifc, int uni } sigio_init(&sc->sc_sigio); - - /* tell tun_dev_open we're initialised */ - - sc->sc_flags |= TUN_INITED|TUN_STAYUP; - wakeup(sc); + sc->sc_flags |= TUN_STAYUP; return (0); @@ -303,25 +299,10 @@ int tun_clone_destroy(struct ifnet *ifp) { struct tun_softc *sc = ifp->if_softc; - dev_t dev; KERNEL_ASSERT_LOCKED(); - if (ISSET(sc->sc_flags, TUN_DEAD)) - return (ENXIO); SET(sc->sc_flags, TUN_DEAD); - - /* kick userland off the device */ - dev = sc->sc_dev; - if (dev) { - struct vnode *vp; - - if (vfinddev(dev, VCHR, &vp)) - VOP_REVOKE(vp, REVOKEALL); - - KASSERT(sc->sc_dev == 0); - } - /* prevent userland from getting to the device again */ SMR_LIST_REMOVE_LOCKED(sc, sc_entry); smr_barrier(); @@ -390,70 +371,29 @@ tun_dev_open(dev_t dev, const struct if_ struct tun_softc *sc; struct ifnet *ifp; int error; - u_short stayup = 0; - struct vnode *vp; - char name[IFNAMSIZ]; - unsigned int rdomain; - - /* - * Find the vnode associated with this open before we sleep - * and let something else revoke it. Our caller has a reference - * to it so we don't need to account for it. - */ - if (!vfinddev(dev, VCHR, &vp)) - panic("%s vfinddev failed", __func__); snprintf(name, sizeof(name), "%s%u", ifc->ifc_name, minor(dev)); - rdomain = rtable_l2(p->p_p->ps_rtableid); - - /* let's find or make an interface to work with */ - while ((sc = tun_name_lookup(name)) == NULL) { - error = if_clone_create(name, rdomain); - switch (error) { - case 0: /* it's probably ours */ - stayup = TUN_STAYUP; - /* FALLTHROUGH */ - case EEXIST: /* we may have lost a race with someone else */ - break; - default: - return (error); - } - } - - refcnt_take(&sc->sc_refs); - /* wait for it to be fully constructed before we use it */ - for (;;) { - if (ISSET(sc->sc_flags, TUN_DEAD)) { - error = ENXIO; - goto done; - } + rw_enter_write(&if_cloners_lock); - if (ISSET(sc->sc_flags, TUN_INITED)) - break; + if ((sc = tun_name_lookup(name)) == NULL) { + unsigned int rdomain = rtable_l2(p->p_p->ps_rtableid); - error = tsleep_nsec(sc, PCATCH, "tuninit", INFSLP); - if (error != 0) { - /* XXX if_clone_destroy if stayup? */ - goto done; - } - } + if ((error = if_clone_create_locked(name, rdomain)) != 0) + goto out; - /* Has tun_clone_destroy torn the rug out under us? */ - if (vp->v_type == VBAD) { - error = ENXIO; - goto done; + sc = tun_name_lookup(name); + CLR(sc->sc_flags, TUN_STAYUP); } if (sc->sc_dev != 0) { - /* aww, we lost */ + /* already opened */ error = EBUSY; - goto done; + goto out; } /* it's ours now */ sc->sc_dev = dev; - CLR(sc->sc_flags, stayup); /* automatically mark the interface running on open */ ifp = &sc->sc_if; @@ -461,72 +401,61 @@ tun_dev_open(dev_t dev, const struct if_ SET(ifp->if_flags, IFF_UP | IFF_RUNNING); NET_UNLOCK(); tun_link_state(ifp, LINK_STATE_FULL_DUPLEX); - error = 0; +out: + rw_exit_write(&if_cloners_lock); -done: - tun_put(sc); return (error); } -/* - * tunclose - close the device; if closing the real device, flush pending - * output and unless STAYUP bring down and destroy the interface. - */ int tunclose(dev_t dev, int flag, int mode, struct proc *p) { - return (tun_dev_close(dev, p)); + return (tun_dev_close(dev, &tun_cloner, p)); } int tapclose(dev_t dev, int flag, int mode, struct proc *p) { - return (tun_dev_close(dev, p)); + return (tun_dev_close(dev, &tap_cloner, p)); } int -tun_dev_close(dev_t dev, struct proc *p) +tun_dev_close(dev_t dev, const struct if_clone *ifc, struct proc *p) { struct tun_softc *sc; struct ifnet *ifp; int error = 0; char name[IFNAMSIZ]; - int destroy = 0; - sc = tun_get(dev); - if (sc == NULL) - return (ENXIO); + snprintf(name, sizeof(name), "%s%u", ifc->ifc_name, minor(dev)); - ifp = &sc->sc_if; + rw_enter_write(&if_cloners_lock); - /* - * junk all pending output - */ - NET_LOCK(); - CLR(ifp->if_flags, IFF_UP | IFF_RUNNING); - CLR(ifp->if_capabilities, TUN_IF_CAPS); - NET_UNLOCK(); - ifq_purge(&ifp->if_snd); + if ((sc = tun_name_lookup(name)) == NULL) { + error = ENXIO; + goto out; + } - CLR(sc->sc_flags, TUN_ASYNC|TUN_HDR); - sigio_free(&sc->sc_sigio); + if (ISSET(sc->sc_flags, TUN_STAYUP)) { + ifp = &sc->sc_if; - if (!ISSET(sc->sc_flags, TUN_DEAD)) { - /* we can't hold a reference to sc before we start a dtor */ - if (!ISSET(sc->sc_flags, TUN_STAYUP)) { - destroy = 1; - strlcpy(name, ifp->if_xname, sizeof(name)); - } else { - tun_link_state(ifp, LINK_STATE_DOWN); - } - } + /* junk all pending output */ + NET_LOCK(); + CLR(ifp->if_flags, IFF_UP | IFF_RUNNING); + CLR(ifp->if_capabilities, TUN_IF_CAPS); + NET_UNLOCK(); + ifq_purge(&ifp->if_snd); - sc->sc_dev = 0; + CLR(sc->sc_flags, TUN_ASYNC|TUN_HDR); + sigio_free(&sc->sc_sigio); - tun_put(sc); + tun_link_state(ifp, LINK_STATE_DOWN); + sc->sc_dev = 0; + } else + if_clone_destroy_locked(name); - if (destroy) - if_clone_destroy(name); +out: + rw_exit_write(&if_cloners_lock); return (error); } @@ -831,10 +760,24 @@ tun_dev_read(dev_t dev, struct uio *uio, ifp = &sc->sc_if; - error = ifq_deq_sleep(&ifp->if_snd, &m0, ISSET(ioflag, IO_NDELAY), - (PZERO + 1)|PCATCH, "tunread", &sc->sc_reading, &sc->sc_dev); - if (error != 0) - goto put; + while ((m0 = ifq_dequeue(&ifp->if_snd)) == NULL) { + if (ISSET(ioflag, IO_NDELAY)) { + error = EWOULDBLOCK; + goto put; + } + + sc->sc_reading = 1; + error = tsleep_nsec(&ifp->if_snd, (PZERO + 1) | PCATCH, + "tunread", INFSLP); + sc->sc_reading = 0; + + if (error != 0) + goto put; + if (ISSET(sc->sc_flags, TUN_DEAD)) { + error = ENXIO; + goto put; + } + } #if NBPFILTER > 0 if (ifp->if_bpf) Index: sys/net/if_tun.h =================================================================== RCS file: /cvs/src/sys/net/if_tun.h,v retrieving revision 1.18 diff -u -p -r1.18 if_tun.h --- sys/net/if_tun.h 17 Nov 2024 00:25:07 -0000 1.18 +++ sys/net/if_tun.h 13 Feb 2025 22:33:28 -0000 @@ -75,7 +75,7 @@ struct tun_hdr { }; #define TUN_OPEN 0x0001 -#define TUN_INITED 0x0002 +#define TUN_INITED 0x0002 /* unused */ #define TUN_RCOLL 0x0004 /* unused */ #define TUN_IASET 0x0008 #define TUN_DSTADDR 0x0010 Index: sys/net/if_var.h =================================================================== RCS file: /cvs/src/sys/net/if_var.h,v retrieving revision 1.135 diff -u -p -r1.135 if_var.h --- sys/net/if_var.h 24 Jan 2025 09:19:07 -0000 1.135 +++ sys/net/if_var.h 13 Feb 2025 22:33:28 -0000 @@ -325,6 +325,7 @@ int niq_enlist(struct niqueue *, struct sysctl_mq((_n), (_l), (_op), (_olp), (_np), (_nl), &(_niq)->ni_q) extern struct ifnet_head ifnetlist; +extern struct rwlock if_cloners_lock; void if_start(struct ifnet *); int if_enqueue(struct ifnet *, struct mbuf *); @@ -355,6 +356,8 @@ int if_isconnected(const struct ifnet *, void if_clone_attach(struct if_clone *); +int if_clone_create_locked(const char *, int); +int if_clone_destroy_locked(const char *); int if_clone_create(const char *, int); int if_clone_destroy(const char *); Index: sys/net/ifq.c =================================================================== RCS file: /cvs/src/sys/net/ifq.c,v retrieving revision 1.56 diff -u -p -r1.56 ifq.c --- sys/net/ifq.c 3 Feb 2025 08:58:52 -0000 1.56 +++ sys/net/ifq.c 13 Feb 2025 22:33:28 -0000 @@ -478,45 +478,6 @@ ifq_dequeue(struct ifqueue *ifq) } int -ifq_deq_sleep(struct ifqueue *ifq, struct mbuf **mp, int nbio, int priority, - const char *wmesg, volatile unsigned int *sleeping, - volatile unsigned int *alive) -{ - struct mbuf *m; - void *cookie; - int error = 0; - - ifq_deq_enter(ifq); - if (ifq->ifq_len == 0 && nbio) - error = EWOULDBLOCK; - else { - for (;;) { - m = ifq->ifq_ops->ifqop_deq_begin(ifq, &cookie); - if (m != NULL) { - ifq->ifq_ops->ifqop_deq_commit(ifq, m, cookie); - ifq->ifq_len--; - *mp = m; - break; - } - - (*sleeping)++; - error = msleep_nsec(ifq, &ifq->ifq_mtx, - priority, wmesg, INFSLP); - (*sleeping)--; - if (error != 0) - break; - if (!(*alive)) { - error = EIO; - break; - } - } - } - ifq_deq_leave(ifq); - - return (error); -} - -int ifq_hdatalen(struct ifqueue *ifq) { struct mbuf *m; Index: sys/net/ifq.h =================================================================== RCS file: /cvs/src/sys/net/ifq.h,v retrieving revision 1.43 diff -u -p -r1.43 ifq.h --- sys/net/ifq.h 3 Feb 2025 08:58:52 -0000 1.43 +++ sys/net/ifq.h 13 Feb 2025 22:33:28 -0000 @@ -446,10 +446,6 @@ void ifq_barrier(struct ifqueue *); void ifq_set_oactive(struct ifqueue *); void ifq_deq_set_oactive(struct ifqueue *); -int ifq_deq_sleep(struct ifqueue *, struct mbuf **, int, int, - const char *, volatile unsigned int *, - volatile unsigned int *); - #define ifq_len(_ifq) READ_ONCE((_ifq)->ifq_len) #define ifq_empty(_ifq) (ifq_len(_ifq) == 0)