On 19 Feb 2024, at 10:34, LIU Yulong wrote:
> Hi OVS experts,
>
> Our ovs-vswitchd crashes with a core dump at ovs_mutex_trylock(&ukey->mutex)
> in the function revalidator_sweep__.
>
> I sent a mail about this before but received no response.
> https://mail.openvswitch.org/pipermail/ovs-discuss/2023-August/052604.html
>
> So I'm sending this mail again. I apologize in advance: I would like to post
> as much useful information as possible to help identify potential issues, so
> this mail will be really long.
>
> Compared to the mail 2023-August/052604.html, we have upgraded OVS to 2.17.8
> and DPDK to 22.11, hoping that the community might already have a fix for
> this issue. Unfortunately, ovs-vswitchd still crashes with a core dump.
As you mentioned, it looks like some kind of memory corruption, which I have
not seen before.
Have you tried this without rte offload? This is the only feature I have never used.
There is also a 2.17.9 release, with DPDK 22.11.6, that you could try.
//Eelco
> Here is some local debug information:
>
> (gdb) bt
> #0 0x00007f8751bbf337 in raise () from /lib64/libc.so.6
> #1 0x00007f8751bc0a28 in abort () from /lib64/libc.so.6
> #2 0x000055c52ed06c7e in ovs_abort_valist (err_no=<optimized out>,
> format=<optimized out>, args=args@entry=0x7f8744249370) at
> lib/util.c:499
> #3 0x000055c52ed06d14 in ovs_abort (err_no=err_no@entry=0,
> format=format@entry=0x55c52f01b1e8 "%s: %s() passed uninitialized
> ovs_mutex") at lib/util.c:491
> #4 0x000055c52ecd17e1 in ovs_mutex_trylock_at
> (l_=l_@entry=0x7f8718dcc098, where=where@entry=0x55c52eff5c60
> "ofproto/ofproto-dpif-upcall.c:3044") at lib/ovs-thread.c:106
> #5 0x000055c52ebf25f9 in revalidator_sweep__
> (revalidator=revalidator@entry=0x55c533082c70,
> purge=purge@entry=false) at ofproto/ofproto-dpif-upcall.c:3044
> #6 0x000055c52ebf640f in revalidator_sweep
> (revalidator=0x55c533082c70) at ofproto/ofproto-dpif-upcall.c:3102
> #7 udpif_revalidator (arg=0x55c533082c70) at
> ofproto/ofproto-dpif-upcall.c:1101
> #8 0x000055c52ecd239f in ovsthread_wrapper (aux_=<optimized out>) at
> lib/ovs-thread.c:422
> #9 0x00007f8753d16e65 in start_thread () from /lib64/libpthread.so.0
> #10 0x00007f8751c8788d in clone () from /lib64/libc.so.6
>
> bt output with pretty print
> (gdb) bt full
> #0 0x00007f8751bbf337 in raise () from /lib64/libc.so.6
> No symbol table info available.
> #1 0x00007f8751bc0a28 in abort () from /lib64/libc.so.6
> No symbol table info available.
> #2 0x000055c52ed06c7e in ovs_abort_valist (err_no=<optimized out>,
> format=<optimized out>, args=args@entry=0x7f8744249370) at
> lib/util.c:499
> No locals.
> #3 0x000055c52ed06d14 in ovs_abort (err_no=err_no@entry=0,
> format=format@entry=0x55c52f01b1e8 "%s: %s() passed uninitialized
> ovs_mutex") at lib/util.c:491
> args = {{
> gp_offset = 32,
> fp_offset = 48,
> overflow_arg_area = 0x7f8744249450,
> reg_save_area = 0x7f8744249390
> }}
> #4 0x000055c52ecd17e1 in ovs_mutex_trylock_at
> (l_=l_@entry=0x7f8718dcc098, where=where@entry=0x55c52eff5c60
> "ofproto/ofproto-dpif-upcall.c:3044") at lib/ovs-thread.c:106
> l = 0x7f8718dcc098
> error = <optimized out>
> __func__ = "ovs_mutex_trylock_at"
> #5 0x000055c52ebf25f9 in revalidator_sweep__
> (revalidator=revalidator@entry=0x55c533082c70,
> purge=purge@entry=false) at ofproto/ofproto-dpif-upcall.c:3044
> ukey_state = <optimized out>
> cursor_52 = {
> impl = 0x7f86f826c8c0,
> bucket_idx = 2,
> entry_idx = 3,
> node = 0x7f86f8dc8020
> }
> odp_actions_stub = {140218217949376, 9951266880679575560,
> 55834640392, 10323741882, 0, 2, 140218217949592, 168, 140218217949416,
> 12885426177, 140218940560728, 281510948569217, 1125934266581005,
> 281509421383809,
> 1099511627785, 0, 140218226622112, 4, 819, 10323556447, 0,
> 2, 140218226622328, 148, 140218226622152, 12885753857,
> 140218940560856, 2984229694, 18446744069414584320,
> 18446462603027808255, 4294967295, 1407392063422464,
> 140218257906000, 3, 248, 10322875848, 0, 2, 140218257906216,
> 168, 140218257906040, 12884901889, 140218940560984, 0, 0, 0, 0, 0,
> 140218228630736, 5, 1088, 10292016731, 24, 2, 140218228630952, 168,
> 140218228630776,
> 12884901889, 140218940561112, 0, 0, 0, 0, 0,
> 140218255617104, 2, 120, 10124105851, 0, 2, 140218255617320, 148,
> 140218255617144, 12884901889, 140218940561240, 0, 0, 0, 0, 0,
> 140218243823504, 0, 0, 0, 0, 2,
> 140218243823720, 140, 140218243823544, 12884901889,
> 140218940561368, 0 <repeats 37 times>}
> odp_actions = {
> base = 0x7f8744249550,
> data = 0x7f8744249550,
> size = 0,
> allocated = 1024,
> header = 0x0,
> msg = 0x0,
> list_node = {
> prev = 0xcccccccccccccccc,
> next = 0xcccccccccccccccc
> },
> source = OFPBUF_STUB
> }
> ukey = 0x7f8718dcc050
> n_ops = 0
> ops = {
> ... a really long list ....
> {
> ukey = 0x0,
> stats = {
>
> n_packets = 0,
> n_bytes = 0,
> used = 0,
> tcp_flags = 0
> },
> dop = {
> type = 0,
> error = 0,
> {
> flow_put = {
> flags = (DPIF_FP_CREATE | unknown: 905450480),
> key = 0x50178a28944953ad,
> key_len = 12884901889,
> mask = 0x0,
> mask_len = 0,
> actions = 0x0,
> actions_len = 0,
> ufid = 0x1,
> pmd_id = 788417076,
> stats = 0x55c52ecd15f8 <ovs_mutex_lock_at+24>
> },
> flow_del = {
> key = 0x2694413835f813f1,
> key_len = 5771233354389738413,
> ufid = 0x300000001,
> terse = false,
> pmd_id = 0,
> stats = 0x0
> },
> execute = {
> actions = 0x2694413835f813f1,
> actions_len = 5771233354389738413,
> needs_help = true,
> probe = false,
> mtu = 3,
> hash = 0,
> flow = 0x0,
> packet = 0x0
> },
> flow_get = {
> key = 0x2694413835f813f1,
> key_len = 5771233354389738413,
> ufid = 0x300000001,
> pmd_id = 0,
> buffer = 0x0,
> flow = 0x0
> }
> }
> }
> }
> ... a really long list ....
> }
> umap = 0x55c53301f998
> cur = <optimized out>
> i = 39
> udpif = 0x55c53301ee20
> dump_seq = 3090869337
> reval_seq = 3090869356
> slice = <optimized out>
> __func__ = "revalidator_sweep__"
> #6 0x000055c52ebf640f in revalidator_sweep
> (revalidator=0x55c533082c70) at ofproto/ofproto-dpif-upcall.c:3102
>
> No locals.
> #7 udpif_revalidator (arg=0x55c533082c70) at
> ofproto/ofproto-dpif-upcall.c:1101
> revalidator = 0x55c533082c70
> udpif = 0x55c53301ee20
> leader = true
> start_time = 10324370685
> last_reval_seq = 3090867551
> n_flows = 14393
> #8 0x000055c52ecd239f in ovsthread_wrapper (aux_=<optimized out>) at
> lib/ovs-thread.c:422
> auxp = <optimized out>
> aux = {
> start = 0x55c52ebf6350 <udpif_revalidator>,
> arg = 0x55c533082c70,
> name = "revalidator\000\000\000\000"
> }
> id = 7
> subprogram_name = 0x7f87280008c0 "pN\314(\207\177"
> #9 0x00007f8753d16e65 in start_thread () from /lib64/libpthread.so.0
> No symbol table info available.
> #10 0x00007f8751c8788d in clone () from /lib64/libc.so.6
> No symbol table info available.
>
> The umap for loop iteration i = 39 (udpif->ukeys[39]) contains the following
> ukeys (output of ovs_dump_udpif_keys):
>
> (struct umap *) 0x55c53301f998:
> (struct udpif_key *) 0x7f86f8df3930: key_len = 148, mask_len = 152
> ufid =
> a18d9eac-9718-21db-7f97-4a287638e2ef
> hash = 0x7b0c4227, pmd_id = 3
> state = UKEY_OPERATIONAL
> state_where = 0x55c52eff6358
> "ofproto/ofproto-dpif-upcall.c:2957"
> n_packets = 1, n_bytes = 115
> used = 10324368860, tcp_flags = 0x0000
> (struct udpif_key *) 0x7f871a6db4e0: key_len = 140, mask_len = 152
> ufid =
> 4e04d989-8729-49fc-1d0a-46ef2449ef75
> hash = 0x2fe84627, pmd_id = 3
> state = UKEY_VISIBLE
> state_where = 0x55c52eff5da8
> "ofproto/ofproto-dpif-upcall.c:2036"
> n_packets = 0, n_bytes = 0
> used = 0, tcp_flags = 0x0000
> (struct udpif_key *) 0x7f8719909d50: key_len = 160, mask_len = 172
> ufid =
> aa5170a7-818e-c902-44bf-4f60ff18f0f7
> hash = 0x f037027, pmd_id = 3
> state = UKEY_OPERATIONAL
> state_where = 0x55c52eff6358
> "ofproto/ofproto-dpif-upcall.c:2957"
> n_packets = 1, n_bytes = 66
> used = 10324370390, tcp_flags = 0x0000
> (struct udpif_key *) 0x7f871a5af4c0: key_len = 168, mask_len = 172
> ufid =
> f9773957-96f7-8c4e-7ba3-46cf73f7b3e0
> hash = 0x32da0a27, pmd_id = 3
> state = UKEY_EVICTED
> state_where = 0x55c52eff5b48
> "ofproto/ofproto-dpif-upcall.c:2608"
> n_packets = 5, n_bytes = 1243
> used = 10324368587, tcp_flags = 0x0018
> (struct udpif_key *) 0x7f8718834ff0: key_len = 148, mask_len = 152
> ufid =
> 0d733731-a642-a60f-4f4f-4c6425dd398e
> hash = 0x61e7a027, pmd_id = 3
> state = UKEY_EVICTED
> state_where = 0x55c52eff5b48
> "ofproto/ofproto-dpif-upcall.c:2608"
> n_packets = 2, n_bytes = 120
> used = 10324367306, tcp_flags = 0x0000
> (struct udpif_key *) 0x7f871aafb0c0: key_len = 168, mask_len = 172
> ufid =
> c9a3ab02-8f78-44ee-6583-45e778fa98a5
> hash = 0x3e3fbc27, pmd_id = 3
> state = UKEY_OPERATIONAL
> state_where = 0x55c52eff6358
> "ofproto/ofproto-dpif-upcall.c:2957"
> n_packets = 1, n_bytes = 60
> used = 10324370056, tcp_flags = 0x0000
> (struct udpif_key *) 0x7f8718c5d660: key_len = 160, mask_len = 172
> ufid =
> 01a3b747-909a-8077-c2d1-4e3542235fd6
> hash = 0x7cff9c27, pmd_id = 3
> state = UKEY_EVICTED
> state_where = 0x55c52eff5b48
> "ofproto/ofproto-dpif-upcall.c:2608"
> n_packets = 8, n_bytes = 492
> used = 10324368705, tcp_flags = 0x0000
> (struct udpif_key *) 0x7f8718d04340: key_len = 168, mask_len = 172
> ufid =
> d3965df7-bb19-f997-e736-41e7c701b5f7
> hash = 0x8db4d027, pmd_id = 3
> state = UKEY_EVICTED
> state_where = 0x55c52eff5b48
> "ofproto/ofproto-dpif-upcall.c:2608"
> n_packets = 7, n_bytes = 3781
> used = 10324367542, tcp_flags = 0x0000
> (struct udpif_key *) 0x7f871aa85860: key_len = 160, mask_len = 172
> ufid =
> 44c1e2ec-b297-fa54-851c-41bf0c255865
> hash = 0xec913027, pmd_id = 3
> state = UKEY_EVICTED
> state_where = 0x55c52eff5b48
> "ofproto/ofproto-dpif-upcall.c:2608"
> n_packets = 1, n_bytes = 66
> used = 10324367277, tcp_flags = 0x0000
> (struct udpif_key *) 0x7f871a260390: key_len = 168, mask_len = 172
> ufid =
> b6c15cd7-907d-8695-a503-4253a3cb0b7a
> hash = 0xd35a2c27, pmd_id = 3
> state = UKEY_EVICTED
> state_where = 0x55c52eff5b48
> "ofproto/ofproto-dpif-upcall.c:2608"
> n_packets = 7, n_bytes = 462
> used = 10324367339, tcp_flags = 0x0000
> (struct udpif_key *) 0x7f871bfae340: key_len = 140, mask_len = 152
> ufid =
> 84068184-aed0-3d90-5923-4d6f53fc309c
> hash = 0xaa917227, pmd_id = 3
> state = UKEY_OPERATIONAL
> state_where = 0x55c52eff6358
> "ofproto/ofproto-dpif-upcall.c:2957"
> n_packets = 5, n_bytes = 322
> used = 10324369800, tcp_flags = 0x0000
> (struct udpif_key *) 0x7f8718d91730: key_len = 148, mask_len = 152
> ufid =
> 70833d4f-afe7-4c55-6267-4458ebddf3fa
> hash = 0x7ad61a27, pmd_id = 3
> state = UKEY_EVICTED
> state_where = 0x55c52eff5b48
> "ofproto/ofproto-dpif-upcall.c:2608"
> n_packets = 1, n_bytes = 140
> used = 10324365702, tcp_flags = 0x0018
> (struct udpif_key *) 0x7f871879ad50: key_len = 168, mask_len = 172
> ufid =
> d6b2900e-adbe-02b5-e7d5-48f216c1710c
> hash = 0xde1a0e27, pmd_id = 3
> state = UKEY_OPERATIONAL
> state_where = 0x55c52eff6358
> "ofproto/ofproto-dpif-upcall.c:2957"
> n_packets = 0, n_bytes = 0
> used = 10324369841, tcp_flags = 0x0000
> (struct udpif_key *) 0x7f871a061ec0: key_len = 160, mask_len = 172
> ufid =
> 47dea4f7-9248-e065-2764-438e3d727cd0
> hash = 0x55b18227, pmd_id = 3
> state = UKEY_OPERATIONAL
> state_where = 0x55c52eff6358
> "ofproto/ofproto-dpif-upcall.c:2957"
> n_packets = 0, n_bytes = 0
> used = 10324369527, tcp_flags = 0x0000
> (struct udpif_key *) 0x7f871ab964b0: key_len = 168, mask_len = 172
> ufid =
> 286cf642-9ca0-8b63-f6ad-443081df74e2
> hash = 0x8d81d627, pmd_id = 3
> state = UKEY_EVICTED
> state_where = 0x55c52eff5b48
> "ofproto/ofproto-dpif-upcall.c:2608"
> n_packets = 187, n_bytes = 41849
> used = 10324366880, tcp_flags = 0x0000
> (struct udpif_key *) 0x7f86f93190e0: key_len = 168, mask_len = 172
> ufid =
> 9de8567b-b3f2-f4e9-cc00-4279bc9be36a
> hash = 0xfb47e827, pmd_id = 3
> state = UKEY_VISIBLE
> state_where = 0x55c52eff5e18
> "ofproto/ofproto-dpif-upcall.c:2089"
> n_packets = 0, n_bytes = 0
> used = 0, tcp_flags = 0x0000
> (struct udpif_key *) 0x7f86f8feceb0: key_len = 168, mask_len = 172
> ufid =
> 3f87d43a-8835-1833-1d24-4dbe0213ace2
> hash = 0x 7274627, pmd_id = 3
> state = UKEY_OPERATIONAL
> state_where = 0x55c52eff6358
> "ofproto/ofproto-dpif-upcall.c:2957"
> n_packets = 0, n_bytes = 0
> used = 10324370446, tcp_flags = 0x0000
> (struct udpif_key *) 0x7f871a74abc0: key_len = 148, mask_len = 152
> ufid =
> d33fdfff-bed2-5328-2f23-4d76ec8b406e
> hash = 0xad0aba27, pmd_id = 3
> state = UKEY_OPERATIONAL
> state_where = 0x55c52eff6358
> "ofproto/ofproto-dpif-upcall.c:2957"
> n_packets = 1149, n_bytes = 73127
> used = 10324370702, tcp_flags = 0x0000
> (struct udpif_key *) 0x7f871966d0d0: key_len = 168, mask_len = 172
> ufid =
> b410e4d7-86cb-f77e-e4ff-4f7e42c2c28a
> hash = 0xc5ac4227, pmd_id = 3
> state = UKEY_OPERATIONAL
> state_where = 0x55c52eff6358
> "ofproto/ofproto-dpif-upcall.c:2957"
> n_packets = 0, n_bytes = 0
> used = 10324369984, tcp_flags = 0x0000
> (struct udpif_key *) 0x7f871b49ddb0: key_len = 168, mask_len = 172
> ufid =
> c9fd9fa3-b621-d37b-507c-4e22b318ce81
> hash = 0xa0cdd627, pmd_id = 3
> state = UKEY_OPERATIONAL
> state_where = 0x55c52eff6358
> "ofproto/ofproto-dpif-upcall.c:2957"
> n_packets = 9, n_bytes = 2957
> used = 10324370136, tcp_flags = 0x0000
> (struct udpif_key *) 0x7f871a9a7940: key_len = 160, mask_len = 172
> ufid =
> 4c0b3bce-8bde-4621-6393-4935fe209d85
> hash = 0x d0d6e27, pmd_id = 3
> state = UKEY_OPERATIONAL
> state_where = 0x55c52eff6358
> "ofproto/ofproto-dpif-upcall.c:2957"
> n_packets = 3, n_bytes = 206
> used = 10324370325, tcp_flags = 0x0000
> (struct udpif_key *) 0x7f8719fae470: key_len = 148, mask_len = 152
> ufid =
> a6a3c8f2-8de5-14eb-110b-4ed2bf989a23
> hash = 0xeda41827, pmd_id = 3
> state = UKEY_OPERATIONAL
> state_where = 0x55c52eff6358
> "ofproto/ofproto-dpif-upcall.c:2957"
> n_packets = 43, n_bytes = 31311
> used = 10324368839, tcp_flags = 0x0018
> (struct udpif_key *) 0x7f86f8d1adb0: key_len = 148, mask_len = 152
> ufid =
> 80b9fe38-8aba-d4b3-c119-410418fe1092
> hash = 0x137d4027, pmd_id = 3
> state = UKEY_EVICTED
> state_where = 0x55c52eff5b48
> "ofproto/ofproto-dpif-upcall.c:2608"
> n_packets = 2, n_bytes = 124
> used = 10324367255, tcp_flags = 0x0000
> (struct udpif_key *) 0x7f871ba370d0: key_len = 168, mask_len = 172
> ufid =
> 7e5c26fd-af10-ba15-653c-454a828c068d
> hash = 0x9306ba27, pmd_id = 3
> state = UKEY_EVICTED
> state_where = 0x55c52eff5b48
> "ofproto/ofproto-dpif-upcall.c:2608"
> n_packets = 5, n_bytes = 820
> used = 10324368550, tcp_flags = 0x0000
> The length is 24.
>
> The umap details:
> (gdb) p *(struct umap *) 0x55c53301f998
> $12 = {
> mutex = {
> lock = {
> __data = {
> __lock = 0,
> __count = 0,
> __owner = 0,
> __nusers = 0,
> __kind = 2,
> __spins = 0,
> __elision = 0,
> __list = {
> __prev = 0x0,
> __next = 0x0
> }
> },
> __size = '\000' <repeats 16 times>, "\002", '\000' <repeats 22 times>,
> __align = 0
> },
> where = 0x55c52efef4be "<unlocked>"
> },
> cmap = {
> impl = {
> p = 0x7f86f826c8c0
> }
> }
> }
>
>
> As we can see, the umap 0x55c53301f998 does not contain the ukey
> 0x7f8718dcc050 (even though the bt full output shows ukey = 0x7f8718dcc050).
> And this ukey 0x7f8718dcc050 indeed has a mutex with an uninitialized
> 'where' pointer (where = 0x0). Maybe this pointer is just invalid.
>
> (gdb) p *(struct udpif_key *)0x7f8718dcc050
> $11 = {
> ...
> mutex = {
> lock = {
> __data = {
> __lock = 0,
> __count = 0,
> __owner = 0,
> __nusers = 0,
> __kind = -1,
> __spins = 0,
> __elision = 0,
> __list = {
> __prev = 0x0,
> __next = 0x0
> }
> },
> __size = '\000' <repeats 16 times>, "\377\377\377\377", '\000'
> <repeats 19 times>,
> __align = 0
> },
> where = 0x0
> },
> ...
> }
>
> There seems to be an out-of-bounds access to the linked list of ukeys here.
>
> So, I would greatly appreciate your help, as it is crucial for OVS to operate
> in our production environment.
>
> I can provide further debug output at any time.
> Waiting for your response...
> Thank you very much in advance.
>
> Best regards,
> LIU Yulong
_______________________________________________
discuss mailing list
disc...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-discuss