Hi,

We are using OVS 2.17.9 and have been observing deadlocks for some time, which 
manifest themselves with the following messages in ovs-vswitchd.log:

2024-10-22T08:15:10.386Z|00004|ovs_rcu(urcu3)|WARN|blocked 1000 ms waiting for 
pmd-c05/id:12 to quiesce
2024-10-22T08:15:11.386Z|00005|ovs_rcu(urcu3)|WARN|blocked 2000 ms waiting for 
pmd-c05/id:12 to quiesce
2024-10-22T08:15:13.386Z|00006|ovs_rcu(urcu3)|WARN|blocked 4000 ms waiting for 
pmd-c05/id:12 to quiesce
2024-10-22T08:15:17.386Z|00007|ovs_rcu(urcu3)|WARN|blocked 8000 ms waiting for 
pmd-c05/id:12 to quiesce
2024-10-22T08:15:25.386Z|00008|ovs_rcu(urcu3)|WARN|blocked 16000 ms waiting for 
pmd-c05/id:12 to quiesce
2024-10-22T08:15:41.386Z|00009|ovs_rcu(urcu3)|WARN|blocked 32000 ms waiting for 
pmd-c05/id:12 to quiesce

In this situation, the switch fails completely and must be restarted to resolve 
the error condition.

A backtrace shows that a PMD thread is blocked waiting to acquire a mutex in 
the conn_expired function (lib/conntrack.c:2372):

Thread 24 (Thread 0x7f779447cc40 (LWP 2956154) "urcu3"):
#0  __futex_abstimed_wait_common64 (private=0, cancel=true, 
abstime=0x7f779447aed0, op=393, expected=0, futex_word=0x7f77c7bc2ba8 
<__aio_new_request_notification+40>) at ./nptl/futex-internal.c:57
#1  __futex_abstimed_wait_common (cancel=true, private=0, 
abstime=0x7f779447aed0, clockid=0, expected=0, futex_word=0x7f77c7bc2ba8 
<__aio_new_request_notification+40>) at ./nptl/futex-internal.c:87
#2  __GI___futex_abstimed_wait_cancelable64 
(futex_word=futex_word@entry=0x7f77c7bc2ba8 
<__aio_new_request_notification+40>, expected=expected@entry=0, 
clockid=clockid@entry=0, abstime=abstime@entry=0x7f779447aed0, 
private=private@entry=0) at ./nptl/futex-internal.c:139
#3  0x00007f77c7a35e9b in __pthread_cond_wait_common (abstime=0x7f779447aed0, 
clockid=0, mutex=0x7f77c7bbc2c0 <__aio_requests_mutex>, cond=0x7f77c7bc2b80 
<__aio_new_request_notification>) at ./nptl/pthread_cond_wait.c:503
#4  ___pthread_cond_timedwait64 (cond=cond@entry=0x7f77c7bc2b80 
<__aio_new_request_notification>, mutex=mutex@entry=0x7f77c7bbc2c0 
<__aio_requests_mutex>, abstime=abstime@entry=0x7f779447aed0) at 
./nptl/pthread_cond_wait.c:652
#5  0x00007f77c7a40288 in handle_fildes_io (arg=<optimized out>) at 
./rt/aio_misc.c:641
#6  0x00007f77c7a36ac3 in start_thread (arg=<optimized out>) at 
./nptl/pthread_create.c:442
#7  0x00007f77c7ac8850 in clone3 () at 
../sysdeps/unix/sysv/linux/x86_64/clone3.S:81
...
Thread 19 (Thread 0x7fe1c04d8640 (LWP 7538) "pmd-c05/id:12"):
#0  futex_wait (private=0, expected=2, futex_word=0x7fe1d07c4790) at 
../sysdeps/nptl/futex-internal.h:146
#1  __GI___lll_lock_wait (futex=futex@entry=0x7fe1d07c4790, private=0) at 
./nptl/lowlevellock.c:49
#2  0x00007fe212c1e002 in lll_mutex_lock_optimized (mutex=0x7fe1d07c4790) at 
./nptl/pthread_mutex_lock.c:48
#3  ___pthread_mutex_lock (mutex=mutex@entry=0x7fe1d07c4790) at 
./nptl/pthread_mutex_lock.c:93
#4  0x000055b41347c86d in ovs_mutex_lock_at (l_=0x7fe1d07c4790, 
where=0x55b413544f32 "../lib/conntrack.c:2372") at ../lib/ovs-thread.c:75
#5  0x000055b4134ec929 in conn_expired (now=330827509, conn=0x7fe1d07c46c0) at 
../lib/conntrack.c:2372
#6  conn_key_lookup (ct=<optimized out>, key=0x7fe1c04d1770, hash=<optimized 
out>, now=330827509, conn_out=0x7fe1c04d17a0, reply=0x7fe1c04d17ac) at 
../lib/conntrack.c:492
#7  0x000055b41340f40b in initial_conn_lookup (natted=false, now=<optimized 
out>, ctx=0x7fe1c04d1770, ct=0x55b418225850) at ../lib/conntrack.c:1158
#8  process_one (tp_id=0, helper=0x0, tp_dst=<optimized out>, tp_src=<optimized 
out>, nat_action_info=0x0, setlabel=0x0, setmark=0x0, now=<optimized out>, 
commit=false, force=false, zone=83, ctx=0x7fe1c04d1770, pkt=0x4984b2e40, 
ct=0x55b418225850) at ../lib/conntrack.c:1192
#9  conntrack_execute (tp_id=<optimized out>, now=<optimized out>, 
nat_action_info=<optimized out>, helper=<optimized out>, tp_dst=<optimized 
out>, tp_src=<optimized out>, setlabel=<optimized out>, setmark=<optimized 
out>, zone=<optimized out>, commit=<optimized out>, force=<optimized out>, 
dl_type=<optimized out>, pkt_batch=0x55b413410779 <dp_execute_cb+8345>, 
ct=<optimized out>) at ../lib/conntrack.c:1309
#10 dp_execute_cb (aux_=aux_@entry=0x7fe1c04d2500, 
packets_=packets_@entry=0x7fe1c04d25a0, a=a@entry=0x7fe1c04d28d8, 
should_steal=should_steal@entry=false) at ../lib/dpif-netdev.c:9095
#11 0x000055b41343ac54 in odp_execute_actions (dp=<optimized out>, 
batch=0x7fe1c04d25a0, steal=<optimized out>, actions=<optimized out>, 
actions_len=<optimized out>, dp_execute_action=<optimized out>) at 
../lib/odp-execute.c:866
#12 0x000055b41340b6bf in dp_netdev_execute_actions (actions_len=<optimized 
out>, actions=<optimized out>, flow=0x7fe1c04d2ab0, should_steal=true, 
packets=0x7fe1c04d25a0, pmd=0x7fe1c04d9010) at ../lib/dpif-netdev.c:9141
#13 handle_packet_upcall (put_actions=0x7fe1c04d2510, actions=0x7fe1c04d2550, 
key=0x7fe1c04d39c0, packet=<optimized out>, pmd=0x7fe1c04d9010) at 
../lib/dpif-netdev.c:8329
#14 fast_path_processing (pmd=pmd@entry=0x7fe1c04d9010, 
packets_=packets_@entry=0x7fe1c04d4e50, keys=keys@entry=0x7fe1c04d3990, 
flow_map=flow_map@entry=0x7fe1c04d3850, 
index_map=index_map@entry=0x7fe1c04d3840 "", in_port=<optimized out>) at 
../lib/dpif-netdev.c:8425
#15 0x000055b41340df60 in dp_netdev_input__ (pmd=pmd@entry=0x7fe1c04d9010, 
packets=packets@entry=0x7fe1c04d4e50, md_is_valid=md_is_valid@entry=true, 
port_no=port_no@entry=0) at ../lib/dpif-netdev.c:8514
#16 0x000055b41340fdb1 in dp_netdev_recirculate (packets=0x7fe1c04d4e50, 
pmd=0x7fe1c04d9010) at ../lib/dpif-netdev.c:8560
#17 dp_execute_cb (aux_=aux_@entry=0x7fe1c04d4db0, packets_=<optimized out>, 
packets_@entry=0x7fe1c04d4e50, a=a@entry=0x7fe1c04d5174, 
should_steal=should_steal@entry=true) at ../lib/dpif-netdev.c:8955
#18 0x000055b41343ac54 in odp_execute_actions (dp=<optimized out>, 
batch=0x7fe1c04d4e50, steal=<optimized out>, actions=<optimized out>, 
actions_len=<optimized out>, dp_execute_action=<optimized out>) at 
../lib/odp-execute.c:866
#19 0x000055b41340b6bf in dp_netdev_execute_actions (actions_len=<optimized 
out>, actions=<optimized out>, flow=0x7fe1c04d5360, should_steal=true, 
packets=0x7fe1c04d4e50, pmd=0x7fe1c04d9010) at ../lib/dpif-netdev.c:9141
#20 handle_packet_upcall (put_actions=0x7fe1c04d4dc0, actions=0x7fe1c04d4e00, 
key=0x7fe1c04d6280, packet=<optimized out>, pmd=0x7fe1c04d9010) at 
../lib/dpif-netdev.c:8329
#21 fast_path_processing (pmd=pmd@entry=0x7fe1c04d9010, 
packets_=packets_@entry=0x7fe1c04d66a0, keys=keys@entry=0x7fe1c04d6240, 
flow_map=flow_map@entry=0x7fe1c04d6100, 
index_map=index_map@entry=0x7fe1c04d60f0 "", in_port=<optimized out>) at 
../lib/dpif-netdev.c:8425
#22 0x000055b41340df60 in dp_netdev_input__ (pmd=<optimized out>, 
packets=<optimized out>, md_is_valid=md_is_valid@entry=false, 
port_no=<optimized out>) at ../lib/dpif-netdev.c:8514
#23 0x000055b413411da1 in dp_netdev_input (pmd=<optimized out>, 
packets=<optimized out>, port_no=<optimized out>) at ../lib/dpif-netdev.c:8552
#24 0x000055b413405487 in dp_netdev_process_rxq_port (pmd=0x7fe1c04d9010, 
rxq=0x55b418870e60, port_no=28) at ../lib/dpif-netdev.c:5384
#25 0x000055b4134134aa in pmd_thread_main (f_=<optimized out>) at 
../lib/dpif-netdev.c:6992
#26 0x000055b41347f523 in ovsthread_wrapper (aux_=<optimized out>) at 
../lib/ovs-thread.c:422
#27 0x00007fe212c1aac3 in start_thread (arg=<optimized out>) at 
./nptl/pthread_create.c:442
#28 0x00007fe212cac850 in clone3 () at 
../sysdeps/unix/sysv/linux/x86_64/clone3.S:81

Does this commit 
(https://github.com/openvswitch/ovs/commit/78387e88bdcf4704b88584405f66eaeb90103139)
already solve the problem?
If so, would it be possible to backport it to 2.17?

Thanks for any feedback

Michael
_______________________________________________
discuss mailing list
disc...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-discuss

Reply via email to