Hi Yunjian, Wednesday, March 6, 2019 5:06 AM, Zhaohui (zhaohui, Polestar): > Subject: [dpdk-dev] Segfault when eal thread executing mlx5 nic's lsc event > > Hi: > I think the flow list may be accessed at the same time by two different > threads and may cause some errors. Does it need a lock to protect the flow list?
In general it is the application's responsibility to synchronize the control between the different threads. The PMD does not support multi-thread control whatsoever. In this specific case, when working on top of bond, the bond reacts to the LSC event while, I assume, your application is inserting/deleting rules using rte_flow APIs. The right fix should be for the bond PMD to take a mutex before calling rte_eth_dev_default_mac_addr_set. > > > Thanks > > Yunjian > (gdb) bt > #0 0x00007f54c9641237 in raise () from /usr/lib64/libc.so.6 > #1 0x00007f54c9642928 in abort () from /usr/lib64/libc.so.6 > #2 0x00000000006a8749 in PAT_abort () > #3 0x00000000006a588d in patchIllInsHandler () > #4 <signal handler called> > #5 0x00007f54c6acd2c8 in flow_list_destroy (dev=dev@entry=0xad8940 > <rte_eth_devices+16512>, flow=0x1444b1b00, list=0x14455e618) at > /usr/src/debug/dpdk-mlx4-pmd-18.11/drivers/net/mlx5/mlx5_flow.c:2150 > #6 0x00007f54c6acfe1b in mlx5_flow_list_flush (dev=0xad8940 > <rte_eth_devices+16512>, list=0x14455e618) at /usr/src/debug/dpdk-mlx4- > pmd-18.11/drivers/net/mlx5/mlx5_flow.c:2170 > #7 0x00007f54c6ac5cc4 in mlx5_traffic_disable (dev=<optimized out>) at > /usr/src/debug/dpdk-mlx4-pmd-18.11/drivers/net/mlx5/mlx5_trigger.c:384 > #8 0x00007f54c6ac637d in mlx5_traffic_restart (dev=0xad8940 > <rte_eth_devices+16512>) at /usr/src/debug/dpdk-mlx4-pmd- > 18.11/drivers/net/mlx5/mlx5_trigger.c:400 > #9 0x00007f54d1db3bba in rte_eth_dev_default_mac_addr_set > (port_id=<optimized out>, addr=0x140200f40) at /usr/src/debug/dpdk- > 18.11/lib/librte_ethdev/rte_ethdev.c:3230 > #10 0x00007f54cd8dee81 in mac_address_slaves_update > (bonded_eth_dev=bonded_eth_dev@entry=0xad48c0 <rte_eth_devices>) > at /usr/src/debug/dpdk- > 18.11/drivers/net/bonding/rte_eth_bond_pmd.c:1842 > #11 0x00007f54cd8e0c31 in bond_ethdev_lsc_event_callback > (port_id=<optimized out>, type=<optimized out>, param=<optimized out>, > ret_param=<optimized out>) at /usr/src/debug/dpdk- > 
18.11/drivers/net/bonding/rte_eth_bond_pmd.c:3070 > #12 0x00007f54cd8e117b in bond_ethdev_slave_lsc_delay (cb_arg=0xad48c0 > <rte_eth_devices>) at /usr/src/debug/dpdk- > 18.11/drivers/net/bonding/rte_eth_bond_pmd.c:2298 > #13 0x00007f54d25ebe5f in eal_alarm_callback (arg=<optimized out>) at > /usr/src/debug/dpdk-18.11/lib/librte_eal/linuxapp/eal/eal_alarm.c:90 > #14 0x00007f54d25ea8aa in eal_intr_process_interrupts (nfds=<optimized > out>, events=<optimized out>) at /usr/src/debug/dpdk- > 18.11/lib/librte_eal/linuxapp/eal/eal_interrupts.c:838 > #15 eal_intr_handle_interrupts (totalfds=<optimized out>, pfd=21) at > /usr/src/debug/dpdk-18.11/lib/librte_eal/linuxapp/eal/eal_interrupts.c:885 > #16 eal_intr_thread_main (arg=<optimized out>) at /usr/src/debug/dpdk- > 18.11/lib/librte_eal/linuxapp/eal/eal_interrupts.c:965 > #17 0x00007f54cade6dd5 in start_thread () from /usr/lib64/libpthread.so.0 > #18 0x00007f54c970950d in clone () from /usr/lib64/libc.so.6 > > > In order to solve this problem(core dump). 
Something modified like this:( > Looking forward to your reply) > > From: zhaohui8 <zhaoh...@huawei.com> > --- > drivers/net/mlx5/mlx5.c | 1 + > drivers/net/mlx5/mlx5.h | 1 + > drivers/net/mlx5/mlx5_flow.c | 33 ++++++++++++++++++++++++++------ > - > drivers/net/mlx5/mlx5_trigger.c | 12 +++++++++++- > 4 files changed, 39 insertions(+), 8 deletions(-) > > diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index > 9e5cab1..e8ae816 100644 > --- a/drivers/net/mlx5/mlx5.c > +++ b/drivers/net/mlx5/mlx5.c > @@ -1195,6 +1195,7 @@ > priv->tcf_context = NULL; > } > } > + rte_rwlock_init(&priv->flows_rwlock); > TAILQ_INIT(&priv->flows); > TAILQ_INIT(&priv->ctrl_flows); > /* Hint libmlx5 to use PMD allocator for data plane resources */ diff -- > git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index > bc500b2..cb8657c 100644 > --- a/drivers/net/mlx5/mlx5.h > +++ b/drivers/net/mlx5/mlx5.h > @@ -202,6 +202,7 @@ struct priv { > unsigned int (*reta_idx)[]; /* RETA index table. */ > unsigned int reta_idx_n; /* RETA index size. */ > struct mlx5_drop drop_queue; /* Flow drop queues. */ > + rte_rwlock_t flows_rwlock; /* flows Lock. */ > struct mlx5_flows flows; /* RTE Flow rules. */ > struct mlx5_flows ctrl_flows; /* Control flow rules. 
*/ > LIST_HEAD(counters, mlx5_flow_counter) flow_counters; diff --git > a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c index > 97dc3e1..2c18602 100644 > --- a/drivers/net/mlx5/mlx5_flow.c > +++ b/drivers/net/mlx5/mlx5_flow.c > @@ -2121,9 +2121,13 @@ struct rte_flow * > const struct rte_flow_action actions[], > struct rte_flow_error *error) > { > - return flow_list_create(dev, > + struct rte_flow * flow; > + rte_rwlock_write_lock(&((struct priv *)dev->data->dev_private)- > >flows_rwlock); > + flow = flow_list_create(dev, > &((struct priv *)dev->data->dev_private)- > >flows, > attr, items, actions, error); > + rte_rwlock_write_unlock(&((struct priv *)dev->data->dev_private)- > >flows_rwlock); > + return flow; > } > > /** > @@ -2235,12 +2239,13 @@ struct rte_flow * > struct priv *priv = dev->data->dev_private; > struct rte_flow *flow; > int ret = 0; > - > + rte_rwlock_read_lock(&priv->flows_rwlock); > TAILQ_FOREACH(flow, &priv->flows, next) { > DRV_LOG(DEBUG, "port %u flow %p still referenced", > dev->data->port_id, (void *)flow); > ++ret; > } > + rte_rwlock_read_unlock(&priv->flows_rwlock); > return ret; > } > > @@ -2320,10 +2325,14 @@ struct rte_flow * > } > for (i = 0; i != priv->reta_idx_n; ++i) > queue[i] = (*priv->reta_idx)[i]; > + rte_rwlock_write_lock(&priv->flows_rwlock); > flow = flow_list_create(dev, &priv->ctrl_flows, > &attr, items, actions, &error); > - if (!flow) > + if (!flow) { > + rte_rwlock_write_unlock(&priv->flows_rwlock); > return -rte_errno; > + } > + rte_rwlock_write_unlock(&priv->flows_rwlock); > return 0; > } > > @@ -2360,8 +2369,9 @@ struct rte_flow * > struct rte_flow_error *error __rte_unused) { > struct priv *priv = dev->data->dev_private; > - > + rte_rwlock_write_lock(&priv->flows_rwlock); > flow_list_destroy(dev, &priv->flows, flow); > + rte_rwlock_write_unlock(&priv->flows_rwlock); > return 0; > } > > @@ -2376,8 +2386,9 @@ struct rte_flow * > struct rte_flow_error *error __rte_unused) { > struct priv *priv = 
dev->data->dev_private; > - > + rte_rwlock_write_lock(&priv->flows_rwlock); > mlx5_flow_list_flush(dev, &priv->flows); > + rte_rwlock_write_unlock(&priv->flows_rwlock); > return 0; > } > > @@ -2729,17 +2740,22 @@ struct rte_flow * > ret = flow_fdir_filter_convert(dev, fdir_filter, fdir_flow); > if (ret) > goto error; > + rte_rwlock_write_lock(&priv->flows_rwlock); > flow = flow_fdir_filter_lookup(dev, fdir_flow); > if (flow) { > rte_errno = EEXIST; > + rte_rwlock_write_unlock(&priv->flows_rwlock); > goto error; > } > flow = flow_list_create(dev, &priv->flows, &fdir_flow->attr, > fdir_flow->items, fdir_flow->actions, NULL); > - if (!flow) > + if (!flow) { > + rte_rwlock_write_unlock(&priv->flows_rwlock); > goto error; > + } > assert(!flow->fdir); > flow->fdir = fdir_flow; > + rte_rwlock_write_unlock(&priv->flows_rwlock); > DRV_LOG(DEBUG, "port %u created FDIR flow %p", > dev->data->port_id, (void *)flow); > return 0; > @@ -2773,6 +2789,7 @@ struct rte_flow * > ret = flow_fdir_filter_convert(dev, fdir_filter, &fdir_flow); > if (ret) > return -rte_errno; > + rte_rwlock_write_lock(&priv->flows_rwlock); > flow = flow_fdir_filter_lookup(dev, &fdir_flow); > if (!flow) { > rte_errno = ENOENT; > @@ -2781,6 +2798,7 @@ struct rte_flow * > flow_list_destroy(dev, &priv->flows, flow); > DRV_LOG(DEBUG, "port %u deleted FDIR flow %p", > dev->data->port_id, (void *)flow); > + rte_rwlock_write_unlock(&priv->flows_rwlock); > return 0; > } > > @@ -2817,8 +2835,9 @@ struct rte_flow * > flow_fdir_filter_flush(struct rte_eth_dev *dev) { > struct priv *priv = dev->data->dev_private; > - > + rte_rwlock_write_lock(&priv->flows_rwlock); > mlx5_flow_list_flush(dev, &priv->flows); > + rte_rwlock_write_unlock(&priv->flows_rwlock); > } > > /** > diff --git a/drivers/net/mlx5/mlx5_trigger.c > b/drivers/net/mlx5/mlx5_trigger.c index e2a9bb7..b95c7cf 100644 > --- a/drivers/net/mlx5/mlx5_trigger.c > +++ b/drivers/net/mlx5/mlx5_trigger.c > @@ -188,12 +188,15 @@ > dev->data->port_id); > goto error; > 
} > + rte_rwlock_read_lock(&priv->flows_rwlock); > ret = mlx5_flow_start(dev, &priv->flows); > if (ret) { > DRV_LOG(DEBUG, "port %u failed to set flows", > dev->data->port_id); > + rte_rwlock_read_unlock(&priv->flows_rwlock); > goto error; > } > + rte_rwlock_read_unlock(&priv->flows_rwlock); > dev->tx_pkt_burst = mlx5_select_tx_function(dev); > dev->rx_pkt_burst = mlx5_select_rx_function(dev); > mlx5_dev_interrupt_handler_install(dev); > @@ -202,7 +205,9 @@ > ret = rte_errno; /* Save rte_errno before cleanup. */ > /* Rollback. */ > dev->data->dev_started = 0; > + rte_rwlock_write_lock(&priv->flows_rwlock); > mlx5_flow_stop(dev, &priv->flows); > + rte_rwlock_write_unlock(&priv->flows_rwlock); > mlx5_traffic_disable(dev); > mlx5_txq_stop(dev); > mlx5_rxq_stop(dev); > @@ -230,7 +235,9 @@ > rte_wmb(); > usleep(1000 * priv->rxqs_n); > DRV_LOG(DEBUG, "port %u stopping device", dev->data->port_id); > + rte_rwlock_write_lock(&priv->flows_rwlock); > mlx5_flow_stop(dev, &priv->flows); > + rte_rwlock_write_unlock(&priv->flows_rwlock); > mlx5_traffic_disable(dev); > mlx5_rx_intr_vec_disable(dev); > mlx5_dev_interrupt_handler_uninstall(dev); > @@ -364,7 +371,9 @@ > return 0; > error: > ret = rte_errno; /* Save rte_errno before cleanup. */ > + rte_rwlock_write_lock(&priv->flows_rwlock); > mlx5_flow_list_flush(dev, &priv->ctrl_flows); > + rte_rwlock_write_unlock(&priv->flows_rwlock); > rte_errno = ret; /* Restore rte_errno. 
*/ > return -rte_errno; > } > @@ -380,8 +389,9 @@ > mlx5_traffic_disable(struct rte_eth_dev *dev) { > struct priv *priv = dev->data->dev_private; > - > + rte_rwlock_write_lock(&priv->flows_rwlock); > mlx5_flow_list_flush(dev, &priv->ctrl_flows); > + rte_rwlock_write_unlock(&priv->flows_rwlock); > } > > > -----邮件原件----- > 发件人: wangyunjian > 发送时间: 2019年2月22日 15:34 > 收件人: dev@dpdk.org; shah...@mellanox.com; ys...@mellanox.com > 抄送: xudingke <xudin...@huawei.com>; Zhaohui (zhaohui, Polestar) > <zhaoh...@huawei.com> > 主题: [dpdk-dev] Segfault when eal thread executing mlx5 nic‘s lsc event