Hi,

I have a machine that panics almost daily in route.c, in rt_check(). This panic has been reported by several users, including Marcel Moolenaar on a machine at freebsd.org.

The problem is present in both 6-STABLE and 7-CURRENT, and apparently it manifests on SMP machines, both i386 and AMD64.

The panic backtrace looks like this:

panic: mtx_lock() of destroyed mutex @ /usr/src/sys/net/route.c:1305
cpuid = 1
KDB: stack backtrace:
db_trace_self_wrapper(c091bcf0,e38b690c,c0659fc1,c093f3cf,1,...) at db_trace_self_wrapper+0x26
kdb_backtrace(c093f3cf,1,c0917de2,e38b6918,1,...) at kdb_backtrace+0x29
panic(c0917de2,c0925d40,519,0,0,...) at panic+0x111
_mtx_lock_flags(c5d333a8,0,c0925d40,519,0,...) at _mtx_lock_flags+0x59
rt_check(e38b6970,e38b698c,c55b7d10,0,0,...) at rt_check+0x11e
arpresolve(c4e27000,c5d33d98,c50dbe00,c55b7d10,e38b69a6,...) at arpresolve+0xaf
ether_output(c4e27000,c50dbe00,c55b7d10,c5d33d98,ccf8b348,...) at ether_output+0x7e
ip_output(c50dbe00,0,e38b6a1c,0,0,...) at ip_output+0xa09
tcp_output(ccefbac8,0,c0929785,91d,0,...) at tcp_output+0x1463
tcp_do_segment(ccefbac8,28,0,1dd,901f,...) at tcp_do_segment+0x1c97
tcp_input(c6095100,14,c4ea3c00,1,0,...) at tcp_input+0xd5e
ip_input(c6095100,0,c09258bd,8c,c09efc38,...) at ip_input+0x662
netisr_processqueue(e38b6cc4,c064df85,c09eb940,1,c4d03480,...) at netisr_processqueue+0x98
swi_net(0,0,c0915aee,471,c4d0bd64,...) at swi_net+0xdb
ithread_loop(c4d0c270,e38b6d38,c0915862,315,c4d56558,...) at ithread_loop+0x1c5
fork_exit(c063e2d0,c4d0c270,e38b6d38) at fork_exit+0xc5
fork_trampoline() at fork_trampoline+0x8

...

#0  doadump () at pcpu.h:195
195     pcpu.h: No such file or directory.
        in pcpu.h
(kgdb) bt
#0  doadump () at pcpu.h:195
#1  0xc0659d2c in boot (howto=260) at /usr/src/sys/kern/kern_shutdown.c:409
#2  0xc0659ff0 in panic (fmt=Variable "fmt" is not available.
) at /usr/src/sys/kern/kern_shutdown.c:563
#3 0xc064e699 in _mtx_lock_flags (m=0x0, opts=0, file=0xc0925d40 "/usr/src/sys/net/route.c", line=1305)
    at /usr/src/sys/kern/kern_mutex.c:178
#4 0xc06fe28e in rt_check (lrt=0xe38b6970, lrt0=0xe38b698c, dst=0xc55b7d10) at /usr/src/sys/net/route.c:1305
#5 0xc070282f in arpresolve (ifp=0xc4e27000, rt0=0xc5d33d98, m=0xc50dbe00, dst=0xc55b7d10, desten=0xe38b69a6 "")
    at /usr/src/sys/netinet/if_ether.c:373
#6 0xc06f019e in ether_output (ifp=0xc4e27000, m=0xc50dbe00, dst=0xc55b7d10, rt0=0xc5d33d98) at /usr/src/sys/net/if_ethersubr.c:175
#7 0xc07127a9 in ip_output (m=0xc50dbe00, opt=0x0, ro=0xe38b6a1c, flags=Variable "flags" is not available.
) at /usr/src/sys/netinet/ip_output.c:547
#8 0xc076d6e3 in tcp_output (tp=0xccefbac8) at /usr/src/sys/netinet/tcp_output.c:1125
#9 0xc076ab87 in tcp_do_segment (m=0xc6095100, th=0xc6095158, so=0xccdb67bc, tp=0xccefbac8, drop_hdrlen=40, tlen=0)
    at /usr/src/sys/netinet/tcp_input.c:2345
#10 0xc076bb0e in tcp_input (m=0xc6095100, off0=20) at /usr/src/sys/netinet/tcp_input.c:843
#11 0xc0710c42 in ip_input (m=0xc6095100) at /usr/src/sys/netinet/ip_input.c:663
#12 0xc06f9148 in netisr_processqueue (ni=0xc09efc38) at /usr/src/sys/net/netisr.c:143
#13 0xc06f925b in swi_net (dummy=0x0) at /usr/src/sys/net/netisr.c:256
#14 0xc063e495 in ithread_loop (arg=0xc4d0c270) at /usr/src/sys/kern/kern_intr.c:1036
#15 0xc063b845 in fork_exit (callout=0xc063e2d0 <ithread_loop>, arg=0xc4d0c270, frame=0xe38b6d38) at /usr/src/sys/kern/kern_fork.c:797
#16 0xc0896f80 in fork_trampoline () at /usr/src/sys/i386/i386/exception.s:205

I've been trying to solve this with Craig Rodrigues, and I've tried several patches, without success. The backtrace above happens on the following code from net/route.c:

1299     /* XXX BSD/OS checks dst->sa_family != AF_NS */
1300     if (rt->rt_flags & RTF_GATEWAY) {
1301         struct rtentry *temp_rt_gwroute = rt->rt_gwroute;
1302         if (temp_rt_gwroute == NULL)
1303             goto lookup;
1304         rt = rt->rt_gwroute;
1305         RT_LOCK(rt);        /* NB: gwroute */
1306         if(rt0->rt_flags & 0x80000000U){
1307             /*This rt is under process...*/
1308             RT_UNLOCK(rt);
1309             RT_UNLOCK(rt0);
1310             goto try_again;
1311         }
1312         if ((rt->rt_flags & RTF_UP) == 0) {
1313             rt0->rt_flags |= 0x80000000U;
1314             RTFREE_LOCKED(rt);  /* unlock gwroute */
1315             rt = rt0;
1316         lookup:
1317             RT_UNLOCK(rt0);
1318             rt = rtalloc1(rt->rt_gateway, 1, 0UL);
1319             if (rt == rt0) {
1320                 rt0->rt_gwroute = NULL;
1321                 RT_REMREF(rt0);
1322                 RT_UNLOCK(rt0);
1323                 return (ENETUNREACH);
1324             }
1325             RT_LOCK(rt0);
1326             rt0->rt_gwroute = rt;
1327             rt0->rt_flags &= (~0x80000000U);
1328             if (rt == NULL) {
1329                 RT_UNLOCK(rt0);
1330                 return (EHOSTUNREACH);
1331             }
1332         }
1333         RT_UNLOCK(rt0);
1334     }

This code includes several patches we tried as workarounds, without any success. The panic is always on the RT_LOCK(rt) line: sometimes it's a NULL pointer dereference, sometimes it's an operation on a destroyed mutex.

This is a critical problem for me, but I believe it's also critical for other users.

Does anyone have more ideas about how to solve this problem?

Attachment: signature.asc
Description: OpenPGP digital signature

Reply via email to