Hi,

I think the flow list may be accessed at the same time by two different threads, which can cause errors. Does it need a lock to protect the flow list?
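To make the suspected race concrete: in the backtrace below, the eal-intr-thread flushes the flow list (mlx5_traffic_restart -> mlx5_traffic_disable -> mlx5_flow_list_flush) while another thread can be creating or destroying flows on the same TAILQ. Here is a minimal standalone sketch of that pattern, and of the reader/writer locking that the patch below applies. It uses plain pthreads and <sys/queue.h> rather than the mlx5 code, and all names in it (struct flow, flow_add, flow_flush, flow_count) are hypothetical:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/queue.h>

/* A shared list like priv->flows, plus the lock the patch introduces. */
struct flow {
	TAILQ_ENTRY(flow) next;
	int id;
};
TAILQ_HEAD(flow_list, flow);

static struct flow_list flows = TAILQ_HEAD_INITIALIZER(flows);
static pthread_rwlock_t flows_rwlock = PTHREAD_RWLOCK_INITIALIZER;

/* Writer: links a new entry, like flow_list_create(). */
static void
flow_add(int id)
{
	struct flow *f = malloc(sizeof(*f));

	if (f == NULL)
		return;
	f->id = id;
	pthread_rwlock_wrlock(&flows_rwlock);
	TAILQ_INSERT_TAIL(&flows, f, next);
	pthread_rwlock_unlock(&flows_rwlock);
}

/* Writer: unlinks every entry, like mlx5_flow_list_flush(). */
static void
flow_flush(void)
{
	struct flow *f;

	pthread_rwlock_wrlock(&flows_rwlock);
	while ((f = TAILQ_FIRST(&flows)) != NULL) {
		TAILQ_REMOVE(&flows, f, next);
		free(f);
	}
	pthread_rwlock_unlock(&flows_rwlock);
}

/* Reader: walks the list, like mlx5_flow_verify(); multiple readers
 * may run concurrently, but writers are excluded. */
static int
flow_count(void)
{
	struct flow *f;
	int n = 0;

	pthread_rwlock_rdlock(&flows_rwlock);
	TAILQ_FOREACH(f, &flows, next)
		++n;
	pthread_rwlock_unlock(&flows_rwlock);
	return n;
}

/* Stand-in for the interrupt thread that flushes the list. */
static void *
flusher(void *arg)
{
	int i;

	(void)arg;
	for (i = 0; i < 100000; i++)
		flow_flush();
	return NULL;
}

int
main(void)
{
	pthread_t t;
	int i;

	pthread_create(&t, NULL, flusher, NULL);
	for (i = 0; i < 100000; i++)
		flow_add(i);
	pthread_join(t, NULL);
	printf("%d flows left\n", flow_count());
	return 0;
}

Compile with "gcc -pthread sketch.c". If the lock/unlock calls are removed, the flusher thread and the main loop race on the TAILQ head and next pointers, which is the same kind of list corruption that flow_list_destroy() aborts on in the backtrace below.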
Thanks,
Yunjian

(gdb) bt
#0  0x00007f54c9641237 in raise () from /usr/lib64/libc.so.6
#1  0x00007f54c9642928 in abort () from /usr/lib64/libc.so.6
#2  0x00000000006a8749 in PAT_abort ()
#3  0x00000000006a588d in patchIllInsHandler ()
#4  <signal handler called>
#5  0x00007f54c6acd2c8 in flow_list_destroy (dev=dev@entry=0xad8940 <rte_eth_devices+16512>, flow=0x1444b1b00, list=0x14455e618)
    at /usr/src/debug/dpdk-mlx4-pmd-18.11/drivers/net/mlx5/mlx5_flow.c:2150
#6  0x00007f54c6acfe1b in mlx5_flow_list_flush (dev=0xad8940 <rte_eth_devices+16512>, list=0x14455e618)
    at /usr/src/debug/dpdk-mlx4-pmd-18.11/drivers/net/mlx5/mlx5_flow.c:2170
#7  0x00007f54c6ac5cc4 in mlx5_traffic_disable (dev=<optimized out>)
    at /usr/src/debug/dpdk-mlx4-pmd-18.11/drivers/net/mlx5/mlx5_trigger.c:384
#8  0x00007f54c6ac637d in mlx5_traffic_restart (dev=0xad8940 <rte_eth_devices+16512>)
    at /usr/src/debug/dpdk-mlx4-pmd-18.11/drivers/net/mlx5/mlx5_trigger.c:400
#9  0x00007f54d1db3bba in rte_eth_dev_default_mac_addr_set (port_id=<optimized out>, addr=0x140200f40)
    at /usr/src/debug/dpdk-18.11/lib/librte_ethdev/rte_ethdev.c:3230
#10 0x00007f54cd8dee81 in mac_address_slaves_update (bonded_eth_dev=bonded_eth_dev@entry=0xad48c0 <rte_eth_devices>)
    at /usr/src/debug/dpdk-18.11/drivers/net/bonding/rte_eth_bond_pmd.c:1842
#11 0x00007f54cd8e0c31 in bond_ethdev_lsc_event_callback (port_id=<optimized out>, type=<optimized out>, param=<optimized out>, ret_param=<optimized out>)
    at /usr/src/debug/dpdk-18.11/drivers/net/bonding/rte_eth_bond_pmd.c:3070
#12 0x00007f54cd8e117b in bond_ethdev_slave_lsc_delay (cb_arg=0xad48c0 <rte_eth_devices>)
    at /usr/src/debug/dpdk-18.11/drivers/net/bonding/rte_eth_bond_pmd.c:2298
#13 0x00007f54d25ebe5f in eal_alarm_callback (arg=<optimized out>)
    at /usr/src/debug/dpdk-18.11/lib/librte_eal/linuxapp/eal/eal_alarm.c:90
#14 0x00007f54d25ea8aa in eal_intr_process_interrupts (nfds=<optimized out>, events=<optimized out>)
    at /usr/src/debug/dpdk-18.11/lib/librte_eal/linuxapp/eal/eal_interrupts.c:838
#15 eal_intr_handle_interrupts (totalfds=<optimized out>, pfd=21)
    at /usr/src/debug/dpdk-18.11/lib/librte_eal/linuxapp/eal/eal_interrupts.c:885
#16 eal_intr_thread_main (arg=<optimized out>)
    at /usr/src/debug/dpdk-18.11/lib/librte_eal/linuxapp/eal/eal_interrupts.c:965
#17 0x00007f54cade6dd5 in start_thread () from /usr/lib64/libpthread.so.0
#18 0x00007f54c970950d in clone () from /usr/lib64/libc.so.6

To solve this problem (a core dump), the code was modified as follows. Looking forward to your reply.

From: zhaohui8 <zhaoh...@huawei.com>
---
 drivers/net/mlx5/mlx5.c         |  1 +
 drivers/net/mlx5/mlx5.h         |  1 +
 drivers/net/mlx5/mlx5_flow.c    | 34 +++++++++++++++++++++++++++-------
 drivers/net/mlx5/mlx5_trigger.c | 12 +++++++++++-
 4 files changed, 40 insertions(+), 8 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 9e5cab1..e8ae816 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -1195,6 +1195,7 @@
 			priv->tcf_context = NULL;
 		}
 	}
+	rte_rwlock_init(&priv->flows_rwlock);
 	TAILQ_INIT(&priv->flows);
 	TAILQ_INIT(&priv->ctrl_flows);
 	/* Hint libmlx5 to use PMD allocator for data plane resources */
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index bc500b2..cb8657c 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -202,6 +202,7 @@ struct priv {
 	unsigned int (*reta_idx)[]; /* RETA index table. */
 	unsigned int reta_idx_n; /* RETA index size. */
 	struct mlx5_drop drop_queue; /* Flow drop queues. */
+	rte_rwlock_t flows_rwlock; /* Lock protecting the flow lists. */
 	struct mlx5_flows flows; /* RTE Flow rules. */
 	struct mlx5_flows ctrl_flows; /* Control flow rules. */
 	LIST_HEAD(counters, mlx5_flow_counter) flow_counters;
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 97dc3e1..2c18602 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -2121,9 +2121,13 @@ struct rte_flow *
 		 const struct rte_flow_action actions[],
 		 struct rte_flow_error *error)
 {
-	return flow_list_create(dev,
+	struct rte_flow *flow;
+	rte_rwlock_write_lock(&((struct priv *)dev->data->dev_private)->flows_rwlock);
+	flow = flow_list_create(dev,
				&((struct priv *)dev->data->dev_private)->flows,
				attr, items, actions, error);
+	rte_rwlock_write_unlock(&((struct priv *)dev->data->dev_private)->flows_rwlock);
+	return flow;
 }

 /**
@@ -2235,12 +2239,13 @@ struct rte_flow *
 	struct priv *priv = dev->data->dev_private;
 	struct rte_flow *flow;
 	int ret = 0;
-
+	rte_rwlock_read_lock(&priv->flows_rwlock);
 	TAILQ_FOREACH(flow, &priv->flows, next) {
 		DRV_LOG(DEBUG, "port %u flow %p still referenced",
 			dev->data->port_id, (void *)flow);
 		++ret;
 	}
+	rte_rwlock_read_unlock(&priv->flows_rwlock);
 	return ret;
 }

@@ -2320,10 +2325,14 @@ struct rte_flow *
 	}
 	for (i = 0; i != priv->reta_idx_n; ++i)
 		queue[i] = (*priv->reta_idx)[i];
+	rte_rwlock_write_lock(&priv->flows_rwlock);
 	flow = flow_list_create(dev, &priv->ctrl_flows, &attr, items,
				actions, &error);
-	if (!flow)
+	if (!flow) {
+		rte_rwlock_write_unlock(&priv->flows_rwlock);
 		return -rte_errno;
+	}
+	rte_rwlock_write_unlock(&priv->flows_rwlock);
 	return 0;
 }

@@ -2360,8 +2369,9 @@ struct rte_flow *
 		  struct rte_flow_error *error __rte_unused)
 {
 	struct priv *priv = dev->data->dev_private;
-
+	rte_rwlock_write_lock(&priv->flows_rwlock);
 	flow_list_destroy(dev, &priv->flows, flow);
+	rte_rwlock_write_unlock(&priv->flows_rwlock);
 	return 0;
 }

@@ -2376,8 +2386,9 @@ struct rte_flow *
 	       struct rte_flow_error *error __rte_unused)
 {
 	struct priv *priv = dev->data->dev_private;
-
+	rte_rwlock_write_lock(&priv->flows_rwlock);
 	mlx5_flow_list_flush(dev, &priv->flows);
+	rte_rwlock_write_unlock(&priv->flows_rwlock);
 	return 0;
 }

@@ -2729,17 +2740,22 @@ struct rte_flow *
 	ret = flow_fdir_filter_convert(dev, fdir_filter, fdir_flow);
 	if (ret)
 		goto error;
+	rte_rwlock_write_lock(&priv->flows_rwlock);
 	flow = flow_fdir_filter_lookup(dev, fdir_flow);
 	if (flow) {
 		rte_errno = EEXIST;
+		rte_rwlock_write_unlock(&priv->flows_rwlock);
 		goto error;
 	}
 	flow = flow_list_create(dev, &priv->flows, &fdir_flow->attr,
				fdir_flow->items, fdir_flow->actions, NULL);
-	if (!flow)
+	if (!flow) {
+		rte_rwlock_write_unlock(&priv->flows_rwlock);
 		goto error;
+	}
 	assert(!flow->fdir);
 	flow->fdir = fdir_flow;
+	rte_rwlock_write_unlock(&priv->flows_rwlock);
 	DRV_LOG(DEBUG, "port %u created FDIR flow %p",
 		dev->data->port_id, (void *)flow);
 	return 0;
@@ -2773,6 +2789,8 @@ struct rte_flow *
 	ret = flow_fdir_filter_convert(dev, fdir_filter, &fdir_flow);
 	if (ret)
 		return -rte_errno;
+	rte_rwlock_write_lock(&priv->flows_rwlock);
 	flow = flow_fdir_filter_lookup(dev, &fdir_flow);
 	if (!flow) {
 		rte_errno = ENOENT;
+		rte_rwlock_write_unlock(&priv->flows_rwlock);
@@ -2781,6 +2799,7 @@ struct rte_flow *
 	flow_list_destroy(dev, &priv->flows, flow);
 	DRV_LOG(DEBUG, "port %u deleted FDIR flow %p",
 		dev->data->port_id, (void *)flow);
+	rte_rwlock_write_unlock(&priv->flows_rwlock);
 	return 0;
 }

@@ -2817,8 +2836,9 @@ struct rte_flow *
 flow_fdir_filter_flush(struct rte_eth_dev *dev)
 {
 	struct priv *priv = dev->data->dev_private;
-
+	rte_rwlock_write_lock(&priv->flows_rwlock);
 	mlx5_flow_list_flush(dev, &priv->flows);
+	rte_rwlock_write_unlock(&priv->flows_rwlock);
 }

 /**
diff --git a/drivers/net/mlx5/mlx5_trigger.c b/drivers/net/mlx5/mlx5_trigger.c
index e2a9bb7..b95c7cf 100644
--- a/drivers/net/mlx5/mlx5_trigger.c
+++ b/drivers/net/mlx5/mlx5_trigger.c
@@ -188,12 +188,15 @@
 			dev->data->port_id);
 		goto error;
 	}
+	rte_rwlock_read_lock(&priv->flows_rwlock);
 	ret = mlx5_flow_start(dev, &priv->flows);
 	if (ret) {
 		DRV_LOG(DEBUG, "port %u failed to set flows",
 			dev->data->port_id);
+		rte_rwlock_read_unlock(&priv->flows_rwlock);
 		goto error;
 	}
+	rte_rwlock_read_unlock(&priv->flows_rwlock);
 	dev->tx_pkt_burst = mlx5_select_tx_function(dev);
 	dev->rx_pkt_burst = mlx5_select_rx_function(dev);
 	mlx5_dev_interrupt_handler_install(dev);
@@ -202,7 +205,9 @@
 	ret = rte_errno; /* Save rte_errno before cleanup. */
 	/* Rollback. */
 	dev->data->dev_started = 0;
+	rte_rwlock_write_lock(&priv->flows_rwlock);
 	mlx5_flow_stop(dev, &priv->flows);
+	rte_rwlock_write_unlock(&priv->flows_rwlock);
 	mlx5_traffic_disable(dev);
 	mlx5_txq_stop(dev);
 	mlx5_rxq_stop(dev);
@@ -230,7 +235,9 @@
 	rte_wmb();
 	usleep(1000 * priv->rxqs_n);
 	DRV_LOG(DEBUG, "port %u stopping device", dev->data->port_id);
+	rte_rwlock_write_lock(&priv->flows_rwlock);
 	mlx5_flow_stop(dev, &priv->flows);
+	rte_rwlock_write_unlock(&priv->flows_rwlock);
 	mlx5_traffic_disable(dev);
 	mlx5_rx_intr_vec_disable(dev);
 	mlx5_dev_interrupt_handler_uninstall(dev);
@@ -364,7 +371,9 @@
 	return 0;
 error:
 	ret = rte_errno; /* Save rte_errno before cleanup. */
+	rte_rwlock_write_lock(&priv->flows_rwlock);
 	mlx5_flow_list_flush(dev, &priv->ctrl_flows);
+	rte_rwlock_write_unlock(&priv->flows_rwlock);
 	rte_errno = ret; /* Restore rte_errno. */
 	return -rte_errno;
 }
@@ -380,8 +389,9 @@
 mlx5_traffic_disable(struct rte_eth_dev *dev)
 {
 	struct priv *priv = dev->data->dev_private;
-
+	rte_rwlock_write_lock(&priv->flows_rwlock);
 	mlx5_flow_list_flush(dev, &priv->ctrl_flows);
+	rte_rwlock_write_unlock(&priv->flows_rwlock);
 }

-----Original Message-----
From: wangyunjian
Sent: Friday, February 22, 2019 15:34
To: dev@dpdk.org; shah...@mellanox.com; ys...@mellanox.com
Cc: xudingke <xudin...@huawei.com>; Zhaohui (zhaohui, Polestar) <zhaoh...@huawei.com>
Subject: [dpdk-dev] Segfault when eal thread executing mlx5 nic's lsc event