Reviewed-by: Xueming Li <xuemi...@mellanox.com>
> -----Original Message----- > From: dev <dev-boun...@dpdk.org> On Behalf Of Adrien Mazarguil > Sent: Thursday, June 14, 2018 4:35 PM > To: Shahaf Shuler <shah...@mellanox.com> > Cc: dev@dpdk.org > Subject: [dpdk-dev] [PATCH v2 3/7] net/mlx5: split PCI from generic probing > code > > All the generic probing code needs is an IB device. While this device is > currently supplied by a PCI > lookup, other methods will be added soon. > > This patch divides the original function, which has become huge over time, as > follows: > > 1. PCI-specific (mlx5_pci_probe()). > 2. All ports of a Verbs device (mlx5_dev_spawn()). > 3. A given port of a Verbs device (mlx5_dev_spawn_one()). > > (Patch based on prior work from Yuanhan Liu) > > Signed-off-by: Adrien Mazarguil <adrien.mazarg...@6wind.com> > -- > v2 changes: > > - Fixed device naming. A port suffix is now appended only if several IB > ports happen to be detected. > - Added separate message to distinguish missing kernel drivers from other > initialization errors, as it was confusing. > --- > drivers/net/mlx5/mlx5.c | 340 ++++++++++++++++++++++++++----------------- > 1 file changed, 209 insertions(+), 131 deletions(-) > > diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index > 1a5391e63..01dcf25b9 100644 > --- a/drivers/net/mlx5/mlx5.c > +++ b/drivers/net/mlx5/mlx5.c > @@ -635,30 +635,34 @@ mlx5_uar_init_secondary(struct rte_eth_dev *dev) } > > /** > - * DPDK callback to register a PCI device. > - * > - * This function creates an Ethernet device for each port of a given > - * PCI device. > + * Spawn an Ethernet device from Verbs information. > * > - * @param[in] pci_drv > - * PCI driver structure (mlx5_driver). > - * @param[in] pci_dev > - * PCI device information. > + * @param dpdk_dev > + * Backing DPDK device. > + * @param ibv_dev > + * Verbs device. > + * @param vf > + * If nonzero, enable VF-specific features. > + * @param[in] attr > + * Verbs device attributes. > + * @param port > + * Verbs port to use (indexed from 1). > * > * @return > - * 0 on success, a negative errno value otherwise and rte_errno is set. > + * A valid Ethernet device object on success, NULL otherwise and rte_errno > + * is set. > */ > -static int > -mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, > - struct rte_pci_device *pci_dev) > +static struct rte_eth_dev * > +mlx5_dev_spawn_one(struct rte_device *dpdk_dev, > + struct ibv_device *ibv_dev, > + int vf, > + const struct ibv_device_attr_ex *attr, > + unsigned int port) > { > - struct ibv_device **list = NULL; > - struct ibv_device *ibv_dev; > - struct ibv_context *ctx = NULL; > - struct ibv_device_attr_ex attr; > + struct ibv_context *ctx; > struct mlx5dv_context dv_attr = { .comp_mask = 0 }; > + struct rte_eth_dev *eth_dev = NULL; > int err = 0; > - unsigned int vf = 0; > unsigned int mps; > unsigned int cqe_comp; > unsigned int tunnel_en = 0; > @@ -670,71 +674,18 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv > __rte_unused, > unsigned int mprq_max_stride_size_n = 0; > unsigned int mprq_min_stride_num_n = 0; > unsigned int mprq_max_stride_num_n = 0; > - int i; > #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT > struct ibv_counter_set_description cs_desc = { .counter_type = 0 }; > #endif > > /* Prepare shared data between primary and secondary process. */ > mlx5_prepare_shared_data(); > - assert(pci_drv == &mlx5_driver); > - list = mlx5_glue->get_device_list(&i); > - if (list == NULL) { > - assert(errno); > - err = errno; > - if (errno == ENOSYS) > - DRV_LOG(ERR, > - "cannot list devices, is ib_uverbs loaded?"); > - goto error; > - } > - assert(i >= 0); > - /* > - * For each listed device, check related sysfs entry against > - * the provided PCI ID. > - */ > - while (i != 0) { > - struct rte_pci_addr pci_addr; > - > - --i; > - DRV_LOG(DEBUG, "checking device \"%s\"", list[i]->name); > - if (mlx5_ibv_device_to_pci_addr(list[i], &pci_addr)) > - continue; > - if ((pci_dev->addr.domain != pci_addr.domain) || > - (pci_dev->addr.bus != pci_addr.bus) || > - (pci_dev->addr.devid != pci_addr.devid) || > - (pci_dev->addr.function != pci_addr.function)) > - continue; > - DRV_LOG(INFO, "PCI information matches, using device \"%s\"", > - list[i]->name); > - vf = ((pci_dev->id.device_id == > - PCI_DEVICE_ID_MELLANOX_CONNECTX4VF) || > - (pci_dev->id.device_id == > - PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF) || > - (pci_dev->id.device_id == > - PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) || > - (pci_dev->id.device_id == > - PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF)); > - ctx = mlx5_glue->open_device(list[i]); > - rte_errno = errno; > - err = rte_errno; > - break; > - } > - if (ctx == NULL) { > - switch (err) { > - case 0: > - DRV_LOG(ERR, > - "cannot access device, is mlx5_ib loaded?"); > - err = ENODEV; > - break; > - case EINVAL: > - DRV_LOG(ERR, > - "cannot use device, are drivers up to date?"); > - break; > - } > - goto error; > + errno = 0; > + ctx = mlx5_glue->open_device(ibv_dev); > + if (!ctx) { > + rte_errno = errno ? errno : ENODEV; > + return NULL; > } > - ibv_dev = list[i]; > - DRV_LOG(DEBUG, "device opened"); > #ifdef HAVE_IBV_MLX5_MOD_SWP > dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP; #endif @@ -822,20 > +773,11 @@ > mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, > DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to" > " old OFED/rdma-core version or firmware configuration"); > #endif > - err = mlx5_glue->query_device_ex(ctx, NULL, &attr); > - if (err) { > - DEBUG("ibv_query_device_ex() failed"); > - goto error; > - } > - DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt); > - for (i = 0; i < attr.orig_attr.phys_port_cnt; i++) { > + { > char name[RTE_ETH_NAME_MAX_LEN]; > - int len; > - uint32_t port = i + 1; /* ports are indexed from one */ > struct ibv_port_attr port_attr; > struct ibv_pd *pd = NULL; > struct priv *priv = NULL; > - struct rte_eth_dev *eth_dev = NULL; > struct ether_addr mac; > struct mlx5_dev_config config = { > .cqe_comp = cqe_comp, > @@ -859,11 +801,11 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv > __rte_unused, > }, > }; > > - len = snprintf(name, sizeof(name), PCI_PRI_FMT, > - pci_dev->addr.domain, pci_dev->addr.bus, > - pci_dev->addr.devid, pci_dev->addr.function); > - if (attr.orig_attr.phys_port_cnt > 1) > - snprintf(name + len, sizeof(name), " port %u", i); > + if (attr->orig_attr.phys_port_cnt > 1) > + snprintf(name, sizeof(name), "%s port %u", > + dpdk_dev->name, port); > + else > + snprintf(name, sizeof(name), "%s", dpdk_dev->name); > if (rte_eal_process_type() == RTE_PROC_SECONDARY) { > eth_dev = rte_eth_dev_attach_secondary(name); > if (eth_dev == NULL) { > @@ -872,7 +814,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv > __rte_unused, > err = rte_errno; > goto error; > } > - eth_dev->device = &pci_dev->device; > + eth_dev->device = dpdk_dev; > eth_dev->dev_ops = &mlx5_dev_sec_ops; > err = mlx5_uar_init_secondary(eth_dev); > if (err) { > @@ -900,16 +842,10 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv > __rte_unused, > mlx5_select_rx_function(eth_dev); > eth_dev->tx_pkt_burst = > mlx5_select_tx_function(eth_dev); > - rte_eth_dev_probing_finish(eth_dev); > - continue; > + mlx5_glue->close_device(ctx); > + return eth_dev; > } > DRV_LOG(DEBUG, "using port %u", port); > - if (!ctx) > - ctx = mlx5_glue->open_device(ibv_dev); > - if (ctx == NULL) { > - err = ENODEV; > - goto port_error; > - } > /* Check port status. */ > err = mlx5_glue->query_port(ctx, port, &port_attr); > if (err) { > @@ -947,23 +883,23 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv > __rte_unused, > priv->ctx = ctx; > strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path, > sizeof(priv->ibdev_path)); > - priv->device_attr = attr; > + priv->device_attr = *attr; > priv->port = port; > priv->pd = pd; > priv->mtu = ETHER_MTU; > - err = mlx5_args(&config, pci_dev->device.devargs); > + err = mlx5_args(&config, dpdk_dev->devargs); > if (err) { > err = rte_errno; > DRV_LOG(ERR, "failed to process device arguments: %s", > strerror(rte_errno)); > goto port_error; > } > - config.hw_csum = !!(attr.device_cap_flags_ex & > + config.hw_csum = !!(attr->device_cap_flags_ex & > IBV_DEVICE_RAW_IP_CSUM); > DRV_LOG(DEBUG, "checksum offloading is %ssupported", > (config.hw_csum ? "" : "not ")); > #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT > - config.flow_counter_en = !!attr.max_counter_sets; > + config.flow_counter_en = !!attr->max_counter_sets; > mlx5_glue->describe_counter_set(ctx, 0, &cs_desc); > DRV_LOG(DEBUG, > "counter type = %d, num of cs = %ld, attributes = %d", > @@ -971,7 +907,7 @@ > mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, > cs_desc.attributes); > #endif > config.ind_table_max_size = > - attr.rss_caps.max_rwq_indirection_table_size; > + attr->rss_caps.max_rwq_indirection_table_size; > /* Remove this check once DPDK supports larger/variable > * indirection tables. */ > if (config.ind_table_max_size > > @@ -979,28 +915,28 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv > __rte_unused, > config.ind_table_max_size = ETH_RSS_RETA_SIZE_512; > DRV_LOG(DEBUG, "maximum Rx indirection table size is %u", > config.ind_table_max_size); > - config.hw_vlan_strip = !!(attr.raw_packet_caps & > + config.hw_vlan_strip = !!(attr->raw_packet_caps & > IBV_RAW_PACKET_CAP_CVLAN_STRIPPING); > DRV_LOG(DEBUG, "VLAN stripping is %ssupported", > (config.hw_vlan_strip ? "" : "not ")); > > - config.hw_fcs_strip = !!(attr.raw_packet_caps & > + config.hw_fcs_strip = !!(attr->raw_packet_caps & > IBV_RAW_PACKET_CAP_SCATTER_FCS); > DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported", > (config.hw_fcs_strip ? "" : "not ")); > > #ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING > - config.hw_padding = !!attr.rx_pad_end_addr_align; > + config.hw_padding = !!attr->rx_pad_end_addr_align; > #endif > DRV_LOG(DEBUG, > "hardware Rx end alignment padding is %ssupported", > (config.hw_padding ? "" : "not ")); > config.vf = vf; > - config.tso = (attr.tso_caps.max_tso > 0 && > - (attr.tso_caps.supported_qpts & > + config.tso = (attr->tso_caps.max_tso > 0 && > + (attr->tso_caps.supported_qpts & > (1 << IBV_QPT_RAW_PACKET))); > if (config.tso) > - config.tso_max_payload_sz = attr.tso_caps.max_tso; > + config.tso_max_payload_sz = attr->tso_caps.max_tso; > if (config.mps && !mps) { > DRV_LOG(ERR, > "multi-packet send not supported on this device" > @@ -1041,8 +977,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv > __rte_unused, > eth_dev->data->dev_private = priv; > priv->dev_data = eth_dev->data; > eth_dev->data->mac_addrs = priv->mac; > - eth_dev->device = &pci_dev->device; > - rte_eth_copy_pci_info(eth_dev, pci_dev); > + eth_dev->device = dpdk_dev; > eth_dev->device->driver = &mlx5_driver.driver; > err = mlx5_uar_init_primary(eth_dev); > if (err) { > @@ -1160,13 +1095,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv > __rte_unused, > priv, mem_event_cb); > rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock); > rte_eth_dev_probing_finish(eth_dev); > - /* > - * Each eth_dev instance is assigned its own Verbs context, > - * since this one is consumed, let the next iteration open > - * another. > - */ > - ctx = NULL; > - continue; > + return eth_dev; > port_error: > if (priv) > rte_free(priv); > @@ -1174,24 +1103,173 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv > __rte_unused, > claim_zero(mlx5_glue->dealloc_pd(pd)); > if (eth_dev && rte_eal_process_type() == RTE_PROC_PRIMARY) > rte_eth_dev_release_port(eth_dev); > - break; > } > - /* > - * XXX if something went wrong in the loop above, there is a resource > - * leak (ctx, pd, priv, dpdk ethdev) but we can do nothing about it as > - * long as the dpdk does not provide a way to deallocate a ethdev and a > - * way to enumerate the registered ethdevs to free the previous ones. > - */ > error: > if (ctx) > claim_zero(mlx5_glue->close_device(ctx)); > - if (list) > - mlx5_glue->free_device_list(list); > - if (err) { > - rte_errno = err; > + assert(err > 0); > + rte_errno = err; > + return NULL; > +} > + > +/** > + * Spawn Ethernet devices from Verbs information, one per detected port. > + * > + * @param dpdk_dev > + * Backing DPDK device. > + * @param ibv_dev > + * Verbs device. > + * @param vf > + * If nonzero, enable VF-specific features. > + * > + * @return > + * A NULL-terminated list of Ethernet device objects on success, NULL > + * otherwise and rte_errno is set. Caller is expected to release list > + * memory through free(). > + */ > +static struct rte_eth_dev ** > +mlx5_dev_spawn(struct rte_device *dpdk_dev, > + struct ibv_device *ibv_dev, > + int vf) > +{ > + struct rte_eth_dev **eth_list = NULL; > + struct ibv_context *ctx; > + struct ibv_device_attr_ex attr; > + unsigned int i; > + int ret; > + > + errno = 0; > + ctx = mlx5_glue->open_device(ibv_dev); > + if (!ctx) { > + rte_errno = errno ? errno : ENODEV; > + if (rte_errno == ENODEV) > + DRV_LOG(ERR, > + "cannot access device, is mlx5_ib loaded?"); > + else > + DRV_LOG(ERR, > + "cannot use device, are drivers up to date?"); > + return NULL; > + } > + ret = mlx5_glue->query_device_ex(ctx, NULL, &attr); > + mlx5_glue->close_device(ctx); > + if (ret) { > + rte_errno = ret; > + DRV_LOG(ERR, "unable to query device information: %s", > + strerror(rte_errno)); > + return NULL; > + } > + DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt); > + eth_list = malloc(sizeof(*eth_list) * > + (attr.orig_attr.phys_port_cnt + 1)); > + if (!eth_list) { > + rte_errno = errno; > + return NULL; > + } > + for (i = 0; i < attr.orig_attr.phys_port_cnt; ++i) { > + eth_list[i] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev, vf, > + &attr, i + 1); > + if (eth_list[i]) > + continue; > + /* Save rte_errno and roll back in case of failure. */ > + ret = rte_errno; > + while (i--) { > + mlx5_dev_close(eth_list[i]); > + if (rte_eal_process_type() == RTE_PROC_PRIMARY) > + rte_free(eth_list[i]->data->dev_private); > + claim_zero(rte_eth_dev_release_port(eth_list[i])); > + } > + free(eth_list); > + rte_errno = ret; > + return NULL; > + } > + eth_list[i] = NULL; > + return eth_list; > +} > + > +/** > + * DPDK callback to register a PCI device. > + * > + * This function creates an Ethernet device for each port of a given > + * PCI device. > + * > + * @param[in] pci_drv > + * PCI driver structure (mlx5_driver). > + * @param[in] pci_dev > + * PCI device information. > + * > + * @return > + * 0 on success, a negative errno value otherwise and rte_errno is set. > + */ > +static int > +mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, > + struct rte_pci_device *pci_dev) { > + struct ibv_device **ibv_list; > + struct rte_eth_dev **eth_list = NULL; > + int vf; > + int ret; > + > + assert(pci_drv == &mlx5_driver); > + switch (pci_dev->id.device_id) { > + case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF: > + case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF: > + case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF: > + case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF: > + vf = 1; > + break; > + default: > + vf = 0; > + } > + errno = 0; > + ibv_list = mlx5_glue->get_device_list(&ret); > + if (!ibv_list) { > + rte_errno = errno ? errno : ENOSYS; > + DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?"); > return -rte_errno; > } > - return 0; > + while (ret-- > 0) { > + struct rte_pci_addr pci_addr; > + > + DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name); > + if (mlx5_ibv_device_to_pci_addr(ibv_list[ret], &pci_addr)) > + continue; > + if (pci_dev->addr.domain != pci_addr.domain || > + pci_dev->addr.bus != pci_addr.bus || > + pci_dev->addr.devid != pci_addr.devid || > + pci_dev->addr.function != pci_addr.function) > + continue; > + DRV_LOG(INFO, "PCI information matches, using device \"%s\"", > + ibv_list[ret]->name); > + break; > + } > + if (ret >= 0) > + eth_list = mlx5_dev_spawn(&pci_dev->device, ibv_list[ret], vf); > + mlx5_glue->free_device_list(ibv_list); > + if (!ret) { > + DRV_LOG(WARNING, > + "no Verbs device matches PCI device " PCI_PRI_FMT "," > + " are kernel drivers loaded?", > + pci_dev->addr.domain, pci_dev->addr.bus, > + pci_dev->addr.devid, pci_dev->addr.function); > + rte_errno = ENOENT; > + ret = -rte_errno; > + } else if (!eth_list || !*eth_list) { > + DRV_LOG(ERR, > + "probe of PCI device " PCI_PRI_FMT " aborted after" > + " encountering an error: %s", > + pci_dev->addr.domain, pci_dev->addr.bus, > + pci_dev->addr.devid, pci_dev->addr.function, > + strerror(rte_errno)); > + ret = -rte_errno; > + } else { > + for (ret = 0; eth_list[ret]; ++ret) { > + rte_eth_copy_pci_info(eth_list[ret], pci_dev); > + rte_eth_dev_probing_finish(eth_list[ret]); > + } > + ret = 0; > + } > + free(eth_list); > + return ret; > } > > static const struct rte_pci_id mlx5_pci_id_map[] = { > -- > 2.11.0