On Feb 28, 1:30 pm, to.my.troc...@gmail.com (Mikolaj Golub) wrote: > But I have faced with another issue (not related to your patch, as it is > observed with unpatched kernel too). When I try to run concurrently two > create/destroy scripts with the same interface the system panics: > > Unread portion of the kernel message buffer: > panic: Bad link elm 0xc5f1a800 next->prev != elm > cpuid = 2 > KDB: enter: panic > exclusive sleep mutex if_clone lock (if_clone lock) r = 0 (0xc0da1cf0) locked > @ /usr/src/sys/net/if_clone.c:248 > exclusive sleep mutex if_clone lock (if_clone lock) r = 0 (0xc0da1cf0) locked > @ /usr/src/sys/net/if_clone.c:248 > exclusive sx so_rcv_sx (so_rcv_sx) r = 0 (0xc6cd3560) locked @ > /usr/src/sys/kern/uipc_sockbuf.c:148 > exclusive sx so_rcv_sx (so_rcv_sx) r = 0 (0xc6b4dbd0) locked @ > /usr/src/sys/kern/uipc_sockbuf.c:148 > Physical memory: 2019 MB > Dumping 160 MB: 145 129 113 97 81 65 49 33 17 1 > > #0 doadump () at pcpu.h:246 > 246 __asm __volatile("movl %%fs:0,%0" : "=r" (td)); > (kgdb) bt > #0 doadump () at pcpu.h:246 > #1 0xc04e8bb9 in db_fncall (dummy1=-1064515926, dummy2=0, dummy3=-1, > dummy4=0xe83f4834 "HH?รจ") > at /usr/src/sys/ddb/db_command.c:548 > #2 0xc04e8fef in db_command (last_cmdp=0xc0de14dc, cmd_table=0x0, dopager=0) > at /usr/src/sys/ddb/db_command.c:445 > #3 0xc04e90a4 in db_command_script (command=0xc0de2404 "call doadump") > at /usr/src/sys/ddb/db_command.c:516 > #4 0xc04ed1d0 in db_script_exec (scriptname=0xe83f4940 "kdb.enter.panic", > warnifnotfound=Variable "warnifnotfound" is not available. 
> ) > at /usr/src/sys/ddb/db_script.c:302 > #5 0xc04ed2b7 in db_script_kdbenter (eventname=0xc0ca1948 "panic") at > /usr/src/sys/ddb/db_script.c:324 > #6 0xc04eaf98 in db_trap (type=3, code=0) at /usr/src/sys/ddb/db_main.c:228 > #7 0xc08cc526 in kdb_trap (type=3, code=0, tf=0xe83f4a7c) at > /usr/src/sys/kern/subr_kdb.c:535 > #8 0xc0bdd38b in trap (frame=0xe83f4a7c) at /usr/src/sys/i386/i386/trap.c:690 > #9 0xc0bbef1b in calltrap () at /usr/src/sys/i386/i386/exception.s:165 > #10 0xc08cc6aa in kdb_enter (why=0xc0ca1948 "panic", msg=0xc0ca1948 "panic") > at cpufunc.h:71 > #11 0xc089d716 in panic (fmt=0xc0c3c80c "Bad link elm %p next->prev != elm") > at /usr/src/sys/kern/kern_shutdown.c:562 > #12 0xc094e7fb in if_clone_destroyif (ifc=0xc0da1cc0, ifp=0xc5f1a800) at > /usr/src/sys/net/if_clone.c:249 > #13 0xc094eb52 in if_clone_destroy (name=0xc664ac20 "tun0") at > /usr/src/sys/net/if_clone.c:227 > #14 0xc094c8a6 in ifioctl (so=0xc6e0a9a8, cmd=2149607801, data=0xc664ac20 > "tun0", td=0xc66c0d80) > at /usr/src/sys/net/if.c:2412 > #15 0xc08e8b25 in soo_ioctl (fp=0xc6d46af0, cmd=2149607801, data=0xc664ac20, > active_cred=0xc5f62280, > td=0xc66c0d80) at /usr/src/sys/kern/sys_socket.c:212 > #16 0xc08e31bd in kern_ioctl (td=0xc66c0d80, fd=3, com=2149607801, > data=0xc664ac20 "tun0") at file.h:262 > #17 0xc08e3344 in ioctl (td=0xc66c0d80, uap=0xe83f4cf8) at > /usr/src/sys/kern/sys_generic.c:678 > #18 0xc0bdca33 in syscall (frame=0xe83f4d38) at > /usr/src/sys/i386/i386/trap.c:1078 > #19 0xc0bbefb0 in Xint0x80_syscall () at > /usr/src/sys/i386/i386/exception.s:261 > #20 0x00000033 in ?? () > Previous frame inner to this frame (corrupt stack?) > (kgdb) fr 12 > #12 0xc094e7fb in if_clone_destroyif (ifc=0xc0da1cc0, ifp=0xc5f1a800) at > /usr/src/sys/net/if_clone.c:249 > 249 IFC_IFLIST_REMOVE(ifc, ifp); > (kgdb) list > 244 * switch to the vnet context of the target vnet. 
> 245 */ > 246 CURVNET_SET_QUIET(ifp->if_vnet); > 247 > 248 IF_CLONE_LOCK(ifc); > 249 IFC_IFLIST_REMOVE(ifc, ifp); > 250 IF_CLONE_UNLOCK(ifc); > 251 > 252 if_delgroup(ifp, ifc->ifc_name); > 253 >
Actually, this issue has already been reported (kern/116837, see the bottom of the discussion) and there was a patch provided by Takahiro Kurosawa [check that ifp is on ifc->ifc_iflist before calling IFC_IFLIST_REMOVE(ifc, ifp)], although he mentioned that another race was still possible. I have tried the patch and, yes, it makes the situation much better: the box did not crash when running two "ifconfig tun0 create/destroy" scripts concurrently, but when I tried 8 concurrent processes :-) it crashed after a couple of minutes in another place: (kgdb) bt #0 doadump () at pcpu.h:246 #1 0xc04ec379 in db_fncall (dummy1=1, dummy2=0, dummy3=-1056947200, dummy4=0xe86848e4 "") at /usr/src/sys/ddb/db_command.c:548 #2 0xc04ec771 in db_command (last_cmdp=0xc0e04d1c, cmd_table=0x0, dopager=1) at /usr/src/sys/ddb/db_command.c:445 #3 0xc04ec8ca in db_command_loop () at /usr/src/sys/ddb/db_command.c:498 #4 0xc04ee76d in db_trap (type=12, code=0) at /usr/src/sys/ddb/db_main.c:229 #5 0xc08d7d06 in kdb_trap (type=12, code=0, tf=0xe8684ad0) at /usr/src/sys/kern/subr_kdb.c:535 #6 0xc0bea66f in trap_fatal (frame=0xe8684ad0, eva=3735929054) at /usr/src/sys/i386/i386/trap.c:929 #7 0xc0beaf90 in trap (frame=0xe8684ad0) at /usr/src/sys/i386/i386/trap.c:328 #8 0xc0bccd7b in calltrap () at /usr/src/sys/i386/i386/exception.s:165 #9 0xc094bfa6 in strcmp (s1=0xc663686b "vmnet", s2=0xdeadc0de <Address 0xdeadc0de out of bounds>) at /usr/src/sys/libkern/strcmp.c:45 #10 0xc095a9c2 in if_clone_destroy (name=0xc5f7d840 "tun0") at /usr/src/sys/net/if_clone.c:209 #11 0xc09584d6 in ifioctl (so=0xc721a80c, cmd=2149607801, data=0xc5f7d840 "tun0", td=0xc731fb90) at /usr/src/sys/net/if.c:2486 #12 0xc08f4615 in soo_ioctl (fp=0xc5f1ca80, cmd=2149607801, data=0xc5f7d840, active_cred=0xc5ed4180, td=0xc731fb90) at /usr/src/sys/kern/sys_socket.c:212 #13 0xc08eec8d in kern_ioctl (td=0xc731fb90, fd=3, com=2149607801, data=0xc5f7d840 "tun0") at file.h:262 #14 0xc08eee14 in ioctl (td=0xc731fb90, uap=0xe8684cf8) at 
/usr/src/sys/kern/sys_generic.c:678 #15 0xc0beab40 in syscall (frame=0xe8684d38) at /usr/src/sys/i386/i386/trap.c:1111 #16 0xc0bcce10 in Xint0x80_syscall () at /usr/src/sys/i386/i386/exception.s:261 #17 0x00000033 in ?? () Previous frame inner to this frame (corrupt stack?) (kgdb) fr 10 #10 0xc095a9c2 in if_clone_destroy (name=0xc5f7d840 "tun0") at /usr/src/sys/net/if_clone.c:209 209 if (strcmp(ifc->ifc_name, ifp->if_dname) == 0) { (kgdb) list 204 return (ENXIO); 205 206 /* Find the cloner for this interface */ 207 IF_CLONERS_LOCK(); 208 LIST_FOREACH(ifc, &V_if_cloners, ifc_list) { 209 if (strcmp(ifc->ifc_name, ifp->if_dname) == 0) { 210 break; 211 } 212 } 213 #ifdef VIMAGE (kgdb) p ifc->ifc_name $1 = 0xc663686b "vmnet" (kgdb) p ifp->if_dname $2 = 0xdeadc0de <Address 0xdeadc0de out of bounds> (kgdb) Maybe we could use ifunit_ref() instead of ifunit(), as in the patch below, to avoid this race (the patch also includes Takahiro Kurosawa's patch from kern/116837)? I ran 32 concurrent "ifconfig tun0 create/destroy" scripts on the patched kernel all night and did not manage to crash the system. -- Mikolaj Golub
--- sys/net/if_clone.c.orig 2010-02-28 16:39:30.000000000 +0200 +++ sys/net/if_clone.c 2010-03-15 23:46:56.000000000 +0200 @@ -196,10 +196,11 @@ if_clone_createif(struct if_clone *ifc, int if_clone_destroy(const char *name) { + int err; struct if_clone *ifc; struct ifnet *ifp; - ifp = ifunit(name); + ifp = ifunit_ref(name); if (ifp == NULL) return (ENXIO); @@ -221,10 +222,14 @@ if_clone_destroy(const char *name) } #endif IF_CLONERS_UNLOCK(); - if (ifc == NULL) + if (ifc == NULL) { + if_rele(ifp); return (EINVAL); + } - return (if_clone_destroyif(ifc, ifp)); + err = if_clone_destroyif(ifc, ifp); + if_rele(ifp); + return err; } /* @@ -234,6 +239,7 @@ int if_clone_destroyif(struct if_clone *ifc, struct ifnet *ifp) { int err; + struct ifnet *tmp; if (ifc->ifc_destroy == NULL) return(EOPNOTSUPP); @@ -246,8 +252,15 @@ if_clone_destroyif(struct if_clone *ifc, CURVNET_SET_QUIET(ifp->if_vnet); IF_CLONE_LOCK(ifc); - IFC_IFLIST_REMOVE(ifc, ifp); + LIST_FOREACH(tmp, &ifc->ifc_iflist, if_clones) { + if (tmp == ifp) { + IFC_IFLIST_REMOVE(ifc, ifp); + break; + } + } IF_CLONE_UNLOCK(ifc); + if (tmp == NULL) + return (ENXIO); /* ifp is not on the list. */ if_delgroup(ifp, ifc->ifc_name);
_______________________________________________ freebsd-net@freebsd.org mailing list http://lists.freebsd.org/mailman/listinfo/freebsd-net To unsubscribe, send any mail to "freebsd-net-unsubscr...@freebsd.org"