Hi, we are using OVS 2.17.9 and have been observing deadlocks for some time, which manifest as the following messages in ovs-vswitchd.log:
2024-10-22T08:15:10.386Z|00004|ovs_rcu(urcu3)|WARN|blocked 1000 ms waiting for pmd-c05/id:12 to quiesce 2024-10-22T08:15:11.386Z|00005|ovs_rcu(urcu3)|WARN|blocked 2000 ms waiting for pmd-c05/id:12 to quiesce 2024-10-22T08:15:13.386Z|00006|ovs_rcu(urcu3)|WARN|blocked 4000 ms waiting for pmd-c05/id:12 to quiesce 2024-10-22T08:15:17.386Z|00007|ovs_rcu(urcu3)|WARN|blocked 8000 ms waiting for pmd-c05/id:12 to quiesce 2024-10-22T08:15:25.386Z|00008|ovs_rcu(urcu3)|WARN|blocked 16000 ms waiting for pmd-c05/id:12 to quiesce 2024-10-22T08:15:41.386Z|00009|ovs_rcu(urcu3)|WARN|blocked 32000 ms waiting for pmd-c05/id:12 to quiesce In this situation, the switch fails completely and must be restarted to resolve the error condition. A backtrace shows that a PMD thread is blocked waiting to acquire a mutex in the conn_expired function: Thread 24 (Thread 0x7f779447cc40 (LWP 2956154) "urcu3"): #0 __futex_abstimed_wait_common64 (private=0, cancel=true, abstime=0x7f779447aed0, op=393, expected=0, futex_word=0x7f77c7bc2ba8 <__aio_new_request_notification+40>) at ./nptl/futex-internal.c:57 #1 __futex_abstimed_wait_common (cancel=true, private=0, abstime=0x7f779447aed0, clockid=0, expected=0, futex_word=0x7f77c7bc2ba8 <__aio_new_request_notification+40>) at ./nptl/futex-internal.c:87 #2 __GI___futex_abstimed_wait_cancelable64 (futex_word=futex_word@entry=0x7f77c7bc2ba8 <__aio_new_request_notification+40>, expected=expected@entry=0, clockid=clockid@entry=0, abstime=abstime@entry=0x7f779447aed0, private=private@entry=0) at ./nptl/futex-internal.c:139 #3 0x00007f77c7a35e9b in __pthread_cond_wait_common (abstime=0x7f779447aed0, clockid=0, mutex=0x7f77c7bbc2c0 <__aio_requests_mutex>, cond=0x7f77c7bc2b80 <__aio_new_request_notification>) at ./nptl/pthread_cond_wait.c:503 #4 ___pthread_cond_timedwait64 (cond=cond@entry=0x7f77c7bc2b80 <__aio_new_request_notification>, mutex=mutex@entry=0x7f77c7bbc2c0 <__aio_requests_mutex>, abstime=abstime@entry=0x7f779447aed0) at ./nptl/pthread_cond_wait.c:652 
#5 0x00007f77c7a40288 in handle_fildes_io (arg=<optimized out>) at ./rt/aio_misc.c:641 #6 0x00007f77c7a36ac3 in start_thread (arg=<optimized out>) at ./nptl/pthread_create.c:442 #7 0x00007f77c7ac8850 in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81 ... Thread 19 (Thread 0x7fe1c04d8640 (LWP 7538) "pmd-c05/id:12"): #0 futex_wait (private=0, expected=2, futex_word=0x7fe1d07c4790) at ../sysdeps/nptl/futex-internal.h:146 #1 __GI___lll_lock_wait (futex=futex@entry=0x7fe1d07c4790, private=0) at ./nptl/lowlevellock.c:49 #2 0x00007fe212c1e002 in lll_mutex_lock_optimized (mutex=0x7fe1d07c4790) at ./nptl/pthread_mutex_lock.c:48 #3 ___pthread_mutex_lock (mutex=mutex@entry=0x7fe1d07c4790) at ./nptl/pthread_mutex_lock.c:93 #4 0x000055b41347c86d in ovs_mutex_lock_at (l_=0x7fe1d07c4790, where=0x55b413544f32 "../lib/conntrack.c:2372") at ../lib/ovs-thread.c:75 #5 0x000055b4134ec929 in conn_expired (now=330827509, conn=0x7fe1d07c46c0) at ../lib/conntrack.c:2372 #6 conn_key_lookup (ct=<optimized out>, key=0x7fe1c04d1770, hash=<optimized out>, now=330827509, conn_out=0x7fe1c04d17a0, reply=0x7fe1c04d17ac) at ../lib/conntrack.c:492 #7 0x000055b41340f40b in initial_conn_lookup (natted=false, now=<optimized out>, ctx=0x7fe1c04d1770, ct=0x55b418225850) at ../lib/conntrack.c:1158 #8 process_one (tp_id=0, helper=0x0, tp_dst=<optimized out>, tp_src=<optimized out>, nat_action_info=0x0, setlabel=0x0, setmark=0x0, now=<optimized out>, commit=false, force=false, zone=83, ctx=0x7fe1c04d1770, pkt=0x4984b2e40, ct=0x55b418225850) at ../lib/conntrack.c:1192 #9 conntrack_execute (tp_id=<optimized out>, now=<optimized out>, nat_action_info=<optimized out>, helper=<optimized out>, tp_dst=<optimized out>, tp_src=<optimized out>, setlabel=<optimized out>, setmark=<optimized out>, zone=<optimized out>, commit=<optimized out>, force=<optimized out>, dl_type=<optimized out>, pkt_batch=0x55b413410779 <dp_execute_cb+8345>, ct=<optimized out>) at ../lib/conntrack.c:1309 #10 dp_execute_cb 
(aux_=aux_@entry=0x7fe1c04d2500, packets_=packets_@entry=0x7fe1c04d25a0, a=a@entry=0x7fe1c04d28d8, should_steal=should_steal@entry=false) at ../lib/dpif-netdev.c:9095 #11 0x000055b41343ac54 in odp_execute_actions (dp=<optimized out>, batch=0x7fe1c04d25a0, steal=<optimized out>, actions=<optimized out>, actions_len=<optimized out>, dp_execute_action=<optimized out>) at ../lib/odp-execute.c:866 #12 0x000055b41340b6bf in dp_netdev_execute_actions (actions_len=<optimized out>, actions=<optimized out>, flow=0x7fe1c04d2ab0, should_steal=true, packets=0x7fe1c04d25a0, pmd=0x7fe1c04d9010) at ../lib/dpif-netdev.c:9141 #13 handle_packet_upcall (put_actions=0x7fe1c04d2510, actions=0x7fe1c04d2550, key=0x7fe1c04d39c0, packet=<optimized out>, pmd=0x7fe1c04d9010) at ../lib/dpif-netdev.c:8329 #14 fast_path_processing (pmd=pmd@entry=0x7fe1c04d9010, packets_=packets_@entry=0x7fe1c04d4e50, keys=keys@entry=0x7fe1c04d3990, flow_map=flow_map@entry=0x7fe1c04d3850, index_map=index_map@entry=0x7fe1c04d3840 "", in_port=<optimized out>) at ../lib/dpif-netdev.c:8425 #15 0x000055b41340df60 in dp_netdev_input__ (pmd=pmd@entry=0x7fe1c04d9010, packets=packets@entry=0x7fe1c04d4e50, md_is_valid=md_is_valid@entry=true, port_no=port_no@entry=0) at ../lib/dpif-netdev.c:8514 #16 0x000055b41340fdb1 in dp_netdev_recirculate (packets=0x7fe1c04d4e50, pmd=0x7fe1c04d9010) at ../lib/dpif-netdev.c:8560 #17 dp_execute_cb (aux_=aux_@entry=0x7fe1c04d4db0, packets_=<optimized out>, packets_@entry=0x7fe1c04d4e50, a=a@entry=0x7fe1c04d5174, should_steal=should_steal@entry=true) at ../lib/dpif-netdev.c:8955 #18 0x000055b41343ac54 in odp_execute_actions (dp=<optimized out>, batch=0x7fe1c04d4e50, steal=<optimized out>, actions=<optimized out>, actions_len=<optimized out>, dp_execute_action=<optimized out>) at ../lib/odp-execute.c:866 #19 0x000055b41340b6bf in dp_netdev_execute_actions (actions_len=<optimized out>, actions=<optimized out>, flow=0x7fe1c04d5360, should_steal=true, packets=0x7fe1c04d4e50, pmd=0x7fe1c04d9010) 
at ../lib/dpif-netdev.c:9141 #20 handle_packet_upcall (put_actions=0x7fe1c04d4dc0, actions=0x7fe1c04d4e00, key=0x7fe1c04d6280, packet=<optimized out>, pmd=0x7fe1c04d9010) at ../lib/dpif-netdev.c:8329 #21 fast_path_processing (pmd=pmd@entry=0x7fe1c04d9010, packets_=packets_@entry=0x7fe1c04d66a0, keys=keys@entry=0x7fe1c04d6240, flow_map=flow_map@entry=0x7fe1c04d6100, index_map=index_map@entry=0x7fe1c04d60f0 "", in_port=<optimized out>) at ../lib/dpif-netdev.c:8425 #22 0x000055b41340df60 in dp_netdev_input__ (pmd=<optimized out>, packets=<optimized out>, md_is_valid=md_is_valid@entry=false, port_no=<optimized out>) at ../lib/dpif-netdev.c:8514 #23 0x000055b413411da1 in dp_netdev_input (pmd=<optimized out>, packets=<optimized out>, port_no=<optimized out>) at ../lib/dpif-netdev.c:8552 #24 0x000055b413405487 in dp_netdev_process_rxq_port (pmd=0x7fe1c04d9010, rxq=0x55b418870e60, port_no=28) at ../lib/dpif-netdev.c:5384 #25 0x000055b4134134aa in pmd_thread_main (f_=<optimized out>) at ../lib/dpif-netdev.c:6992 #26 0x000055b41347f523 in ovsthread_wrapper (aux_=<optimized out>) at ../lib/ovs-thread.c:422 #27 0x00007fe212c1aac3 in start_thread (arg=<optimized out>) at ./nptl/pthread_create.c:442 #28 0x00007fe212cac850 in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81 Maybe this commit (https://github.com/openvswitch/ovs/commit/78387e88bdcf4704b88584405f66eaeb90103139) already solves the problem? If so, would it be possible to backport this commit to 2.17? Thanks for any feedback Michael
_______________________________________________ discuss mailing list disc...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-discuss