Hi there, We have recently hit several ovs-vswitchd core dumps in our running cloud; the daemon has crashed many times.
The OVS version is 2.17.2, running with DPDK 20.11. The DPDK rte offload is enabled on Mellanox CX6-Dx. The core dump backtrace is: #0 0x00007f3af0bc0337 in __bsd_signal (sig=23855, handler=0x5d5a) at ../sysdeps/posix/signal.c:50 #1 0x00007f3af0bc1a28 in __GI_abort () at abort.c:79 #2 0x000055cce5da44ee in ovs_abort_valist (err_no=<optimized out>, format=<optimized out>, args=args@entry=0x7f3ae2981360) at lib/util.c:499 #3 0x000055cce5da4584 in ovs_abort (err_no=err_no@entry=0, format=format@entry=0x55cce6042d18 "%s: %s() passed uninitialized ovs_mutex") at lib/util.c:491 #4 0x000055cce5d6f4a1 in ovs_mutex_trylock_at (l_=l_@entry=0x7f3aac156ec8, where=where@entry=0x55cce6020318 "ofproto/ofproto-dpif-upcall.c:3014") at lib/ovs-thread.c:106 #5 0x000055cce5c98181 in revalidator_sweep__ (revalidator=revalidator@entry=0x55cce9595780, purge=purge@entry=false) at ofproto/ofproto-dpif-upcall.c:3014 #6 0x000055cce5c9c1a6 in revalidator_sweep (revalidator=0x55cce9595780) at ofproto/ofproto-dpif-upcall.c:3072 #7 udpif_revalidator (arg=0x55cce9595780) at ofproto/ofproto-dpif-upcall.c:1086 #8 0x000055cce5d7005f in ovsthread_wrapper (aux_=<optimized out>) at lib/ovs-thread.c:422 #9 0x00007f3af2afee65 in start_thread (arg=0x0) at pthread_create.c:282 #10 0x00007f3af0c8888d in __libc_ifunc_impl_list (name=<optimized out>, array=0x7f3ae2986700, max=<optimized out>) at ../sysdeps/x86_64/multiarch/ifunc-impl-list.c:329 #11 0x0000000000000000 in ?? 
() (gdb) info threads Id Target Id Frame 22 LWP 23896 0x00007f3af2b05e5d in msync () at ../sysdeps/unix/syscall-template.S:83 21 LWP 104165 0x00007f3af2b0571d in write () at ../sysdeps/unix/syscall-template.S:83 20 LWP 23897 0x00007f3af0c7dbed in fts_read (sp=0x7f3ac40008c0) at fts.c:459 19 LWP 23886 0x00007f3af0c7dbed in fts_read (sp=0x7f3ad80008c0) at fts.c:459 18 LWP 23885 0x00007f3af0c7dbed in fts_read (sp=0x7f3acc0008c0) at fts.c:459 17 LWP 23873 0x00007f3af0c4f80d in __sigaddset (__sig=17, __set=0x7f3ae3e52100) at ../sysdeps/unix/sysv/linux/bits/sigset.h:118 16 Thread 0x7f3af315c000 (LWP 23855) 0x00007f3af0c7dbed in fts_read (sp=0x55cce966db40) at fts.c:459 15 LWP 8050 0x00007f3af2b02da2 in pthread_cond_timedwait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S:193 14 LWP 104178 0x00007f3af2b0571d in write () at ../sysdeps/unix/syscall-template.S:83 13 LWP 104177 0x00007f3af2b0571d in write () at ../sysdeps/unix/syscall-template.S:83 12 LWP 23904 0x00007f3af0c7dbed in fts_read (sp=0x7f3aa4004ae0) at fts.c:459 11 LWP 104171 0x00007f3af2b0571d in write () at ../sysdeps/unix/syscall-template.S:83 10 LWP 103766 0x00007f3af2b0571d in write () at ../sysdeps/unix/syscall-template.S:83 9 LWP 23872 0x00007f3af2b0599d in do_fcntl (arg=<optimized out>, cmd=7, fd=21964) at ../sysdeps/unix/sysv/linux/fcntl.c:39 8 LWP 67365 0x00007f3af2b0571d in write () at ../sysdeps/unix/syscall-template.S:83 7 LWP 23857 0x00007f3af2b05b6d in recvfrom () at ../sysdeps/unix/syscall-template.S:81 6 LWP 23876 0x00007f3af0c7dbed in fts_read (sp=0x7f3ad40008c0) at fts.c:459 5 LWP 23856 0x00007f3af0c88e63 in arch_prctl () at ../sysdeps/unix/syscall-template.S:81 4 LWP 23905 0x00007f3af2b056bd in vfork () at ../sysdeps/unix/sysv/linux/x86_64/vfork.S:57 3 LWP 67062 0x00007f3af0c7dbed in fts_read (sp=0x0) at fts.c:459 2 LWP 67061 0x00007f3af0c4f80d in __sigaddset (__sig=17, __set=0x7f3ae031c160) at ../sysdeps/unix/sysv/linux/bits/sigset.h:118 * 1 LWP 23898 
0x00007f3af0bc0337 in __bsd_signal (sig=23855, handler=0x5d5a) at ../sysdeps/posix/signal.c:50 Inspecting the core file with gdb, we have: (gdb) print $22->mutex $24 = {lock = {__data = {__lock = -866881024, __count = 2697380352, __owner = 1100469760, __nusers = 2830690816, __kind = 0, __spins = 0, __elision = 0, __list = {__prev = 0x0, __next = 0x0}}, __size = "\000rT\314\000\302Ơ\000֗A\000긨", '\000' <repeats 23 times>, __align = -6861583673568497152}, where = 0x0} (gdb) print &$22->mutex $25 = (struct ovs_mutex *) 0x7f3aae6df548 (gdb) print *0x7f3aac156ec8 $26 = 0 (gdb) print (struct ovs_mutex *)0x7f3aac156ec8 $27 = (struct ovs_mutex *) 0x7f3aac156ec8 (gdb) print $27->where $28 = 0x0 After searching the code, we are confident that the mutex is initialized. However, it seems the mutex is deleted/replaced/released during the revalidator sweep phase. We found some patches and discussions that may be related to this: https://mail.openvswitch.org/pipermail/ovs-dev/2016-August/322128.html https://mail.openvswitch.org/pipermail/ovs-dev/2016-August/322125.html Could anyone provide a preliminary evaluation of the problem? How could this happen? And what can we do to work around the problem? Regards, LIU Yulong _______________________________________________ discuss mailing list disc...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-discuss