Reviewed-by: Xueming Li <xuemi...@mellanox.com>

> -----Original Message-----
> From: dev <dev-boun...@dpdk.org> On Behalf Of Adrien Mazarguil
> Sent: Thursday, June 14, 2018 4:35 PM
> To: Shahaf Shuler <shah...@mellanox.com>
> Cc: dev@dpdk.org
> Subject: [dpdk-dev] [PATCH v2 3/7] net/mlx5: split PCI from generic probing code
> 
> All the generic probing code needs is an IB device. While this device is
> currently supplied by a PCI lookup, other methods will be added soon.
> 
> This patch divides the original function, which has become huge over time,
> as follows:
> 
> 1. PCI-specific (mlx5_pci_probe()).
> 2. All ports of a Verbs device (mlx5_dev_spawn()).
> 3. A given port of a Verbs device (mlx5_dev_spawn_one()).
> 
> (Patch based on prior work from Yuanhan Liu)
> 
> Signed-off-by: Adrien Mazarguil <adrien.mazarg...@6wind.com>
> --
> v2 changes:
> 
> - Fixed device naming. A port suffix is now appended only if several IB
>   ports happen to be detected.
> - Added separate message to distinguish missing kernel drivers from other
>   initialization errors, as it was confusing.
> ---
>  drivers/net/mlx5/mlx5.c | 340 ++++++++++++++++++++++++++-----------------
>  1 file changed, 209 insertions(+), 131 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
> index 1a5391e63..01dcf25b9 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -635,30 +635,34 @@ mlx5_uar_init_secondary(struct rte_eth_dev *dev)
>  }
> 
>  /**
> - * DPDK callback to register a PCI device.
> - *
> - * This function creates an Ethernet device for each port of a given
> - * PCI device.
> + * Spawn an Ethernet device from Verbs information.
>   *
> - * @param[in] pci_drv
> - *   PCI driver structure (mlx5_driver).
> - * @param[in] pci_dev
> - *   PCI device information.
> + * @param dpdk_dev
> + *   Backing DPDK device.
> + * @param ibv_dev
> + *   Verbs device.
> + * @param vf
> + *   If nonzero, enable VF-specific features.
> + * @param[in] attr
> + *   Verbs device attributes.
> + * @param port
> + *   Verbs port to use (indexed from 1).
>   *
>   * @return
> - *   0 on success, a negative errno value otherwise and rte_errno is set.
> + *   A valid Ethernet device object on success, NULL otherwise and rte_errno
> + *   is set.
>   */
> -static int
> -mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
> -            struct rte_pci_device *pci_dev)
> +static struct rte_eth_dev *
> +mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
> +                struct ibv_device *ibv_dev,
> +                int vf,
> +                const struct ibv_device_attr_ex *attr,
> +                unsigned int port)
>  {
> -     struct ibv_device **list = NULL;
> -     struct ibv_device *ibv_dev;
> -     struct ibv_context *ctx = NULL;
> -     struct ibv_device_attr_ex attr;
> +     struct ibv_context *ctx;
>       struct mlx5dv_context dv_attr = { .comp_mask = 0 };
> +     struct rte_eth_dev *eth_dev = NULL;
>       int err = 0;
> -     unsigned int vf = 0;
>       unsigned int mps;
>       unsigned int cqe_comp;
>       unsigned int tunnel_en = 0;
> @@ -670,71 +674,18 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv 
> __rte_unused,
>       unsigned int mprq_max_stride_size_n = 0;
>       unsigned int mprq_min_stride_num_n = 0;
>       unsigned int mprq_max_stride_num_n = 0;
> -     int i;
>  #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
>       struct ibv_counter_set_description cs_desc = { .counter_type = 0 };
>  #endif
> 
>       /* Prepare shared data between primary and secondary process. */
>       mlx5_prepare_shared_data();
> -     assert(pci_drv == &mlx5_driver);
> -     list = mlx5_glue->get_device_list(&i);
> -     if (list == NULL) {
> -             assert(errno);
> -             err = errno;
> -             if (errno == ENOSYS)
> -                     DRV_LOG(ERR,
> -                             "cannot list devices, is ib_uverbs loaded?");
> -             goto error;
> -     }
> -     assert(i >= 0);
> -     /*
> -      * For each listed device, check related sysfs entry against
> -      * the provided PCI ID.
> -      */
> -     while (i != 0) {
> -             struct rte_pci_addr pci_addr;
> -
> -             --i;
> -             DRV_LOG(DEBUG, "checking device \"%s\"", list[i]->name);
> -             if (mlx5_ibv_device_to_pci_addr(list[i], &pci_addr))
> -                     continue;
> -             if ((pci_dev->addr.domain != pci_addr.domain) ||
> -                 (pci_dev->addr.bus != pci_addr.bus) ||
> -                 (pci_dev->addr.devid != pci_addr.devid) ||
> -                 (pci_dev->addr.function != pci_addr.function))
> -                     continue;
> -             DRV_LOG(INFO, "PCI information matches, using device \"%s\"",
> -                     list[i]->name);
> -             vf = ((pci_dev->id.device_id ==
> -                    PCI_DEVICE_ID_MELLANOX_CONNECTX4VF) ||
> -                   (pci_dev->id.device_id ==
> -                    PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF) ||
> -                   (pci_dev->id.device_id ==
> -                    PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) ||
> -                   (pci_dev->id.device_id ==
> -                    PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF));
> -             ctx = mlx5_glue->open_device(list[i]);
> -             rte_errno = errno;
> -             err = rte_errno;
> -             break;
> -     }
> -     if (ctx == NULL) {
> -             switch (err) {
> -             case 0:
> -                     DRV_LOG(ERR,
> -                             "cannot access device, is mlx5_ib loaded?");
> -                     err = ENODEV;
> -                     break;
> -             case EINVAL:
> -                     DRV_LOG(ERR,
> -                             "cannot use device, are drivers up to date?");
> -                     break;
> -             }
> -             goto error;
> +     errno = 0;
> +     ctx = mlx5_glue->open_device(ibv_dev);
> +     if (!ctx) {
> +             rte_errno = errno ? errno : ENODEV;
> +             return NULL;
>       }
> -     ibv_dev = list[i];
> -     DRV_LOG(DEBUG, "device opened");
>  #ifdef HAVE_IBV_MLX5_MOD_SWP
>       dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
>  #endif
> @@ -822,20 +773,11 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>       DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to"
>               " old OFED/rdma-core version or firmware configuration");
>  #endif
> -     err = mlx5_glue->query_device_ex(ctx, NULL, &attr);
> -     if (err) {
> -             DEBUG("ibv_query_device_ex() failed");
> -             goto error;
> -     }
> -     DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
> -     for (i = 0; i < attr.orig_attr.phys_port_cnt; i++) {
> +     {
>               char name[RTE_ETH_NAME_MAX_LEN];
> -             int len;
> -             uint32_t port = i + 1; /* ports are indexed from one */
>               struct ibv_port_attr port_attr;
>               struct ibv_pd *pd = NULL;
>               struct priv *priv = NULL;
> -             struct rte_eth_dev *eth_dev = NULL;
>               struct ether_addr mac;
>               struct mlx5_dev_config config = {
>                       .cqe_comp = cqe_comp,
> @@ -859,11 +801,11 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv 
> __rte_unused,
>                       },
>               };
> 
> -             len = snprintf(name, sizeof(name), PCI_PRI_FMT,
> -                      pci_dev->addr.domain, pci_dev->addr.bus,
> -                      pci_dev->addr.devid, pci_dev->addr.function);
> -             if (attr.orig_attr.phys_port_cnt > 1)
> -                     snprintf(name + len, sizeof(name), " port %u", i);
> +             if (attr->orig_attr.phys_port_cnt > 1)
> +                     snprintf(name, sizeof(name), "%s port %u",
> +                              dpdk_dev->name, port);
> +             else
> +                     snprintf(name, sizeof(name), "%s", dpdk_dev->name);
>               if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
>                       eth_dev = rte_eth_dev_attach_secondary(name);
>                       if (eth_dev == NULL) {
> @@ -872,7 +814,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv 
> __rte_unused,
>                               err = rte_errno;
>                               goto error;
>                       }
> -                     eth_dev->device = &pci_dev->device;
> +                     eth_dev->device = dpdk_dev;
>                       eth_dev->dev_ops = &mlx5_dev_sec_ops;
>                       err = mlx5_uar_init_secondary(eth_dev);
>                       if (err) {
> @@ -900,16 +842,10 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv 
> __rte_unused,
>                               mlx5_select_rx_function(eth_dev);
>                       eth_dev->tx_pkt_burst =
>                               mlx5_select_tx_function(eth_dev);
> -                     rte_eth_dev_probing_finish(eth_dev);
> -                     continue;
> +                     mlx5_glue->close_device(ctx);
> +                     return eth_dev;
>               }
>               DRV_LOG(DEBUG, "using port %u", port);
> -             if (!ctx)
> -                     ctx = mlx5_glue->open_device(ibv_dev);
> -             if (ctx == NULL) {
> -                     err = ENODEV;
> -                     goto port_error;
> -             }
>               /* Check port status. */
>               err = mlx5_glue->query_port(ctx, port, &port_attr);
>               if (err) {
> @@ -947,23 +883,23 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv 
> __rte_unused,
>               priv->ctx = ctx;
>               strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
>                       sizeof(priv->ibdev_path));
> -             priv->device_attr = attr;
> +             priv->device_attr = *attr;
>               priv->port = port;
>               priv->pd = pd;
>               priv->mtu = ETHER_MTU;
> -             err = mlx5_args(&config, pci_dev->device.devargs);
> +             err = mlx5_args(&config, dpdk_dev->devargs);
>               if (err) {
>                       err = rte_errno;
>                       DRV_LOG(ERR, "failed to process device arguments: %s",
>                               strerror(rte_errno));
>                       goto port_error;
>               }
> -             config.hw_csum = !!(attr.device_cap_flags_ex &
> +             config.hw_csum = !!(attr->device_cap_flags_ex &
>                                   IBV_DEVICE_RAW_IP_CSUM);
>               DRV_LOG(DEBUG, "checksum offloading is %ssupported",
>                       (config.hw_csum ? "" : "not "));
>  #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
> -             config.flow_counter_en = !!attr.max_counter_sets;
> +             config.flow_counter_en = !!attr->max_counter_sets;
>               mlx5_glue->describe_counter_set(ctx, 0, &cs_desc);
>               DRV_LOG(DEBUG,
>                       "counter type = %d, num of cs = %ld, attributes = %d",
> @@ -971,7 +907,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>                       cs_desc.attributes);
>  #endif
>               config.ind_table_max_size =
> -                     attr.rss_caps.max_rwq_indirection_table_size;
> +                     attr->rss_caps.max_rwq_indirection_table_size;
>               /* Remove this check once DPDK supports larger/variable
>                * indirection tables. */
>               if (config.ind_table_max_size >
> @@ -979,28 +915,28 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv 
> __rte_unused,
>                       config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
>               DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
>                       config.ind_table_max_size);
> -             config.hw_vlan_strip = !!(attr.raw_packet_caps &
> +             config.hw_vlan_strip = !!(attr->raw_packet_caps &
>                                        IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
>               DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
>                       (config.hw_vlan_strip ? "" : "not "));
> 
> -             config.hw_fcs_strip = !!(attr.raw_packet_caps &
> +             config.hw_fcs_strip = !!(attr->raw_packet_caps &
>                                        IBV_RAW_PACKET_CAP_SCATTER_FCS);
>               DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
>                       (config.hw_fcs_strip ? "" : "not "));
> 
>  #ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING
> -             config.hw_padding = !!attr.rx_pad_end_addr_align;
> +             config.hw_padding = !!attr->rx_pad_end_addr_align;
>  #endif
>               DRV_LOG(DEBUG,
>                       "hardware Rx end alignment padding is %ssupported",
>                       (config.hw_padding ? "" : "not "));
>               config.vf = vf;
> -             config.tso = (attr.tso_caps.max_tso > 0 &&
> -                           (attr.tso_caps.supported_qpts &
> +             config.tso = (attr->tso_caps.max_tso > 0 &&
> +                           (attr->tso_caps.supported_qpts &
>                              (1 << IBV_QPT_RAW_PACKET)));
>               if (config.tso)
> -                     config.tso_max_payload_sz = attr.tso_caps.max_tso;
> +                     config.tso_max_payload_sz = attr->tso_caps.max_tso;
>               if (config.mps && !mps) {
>                       DRV_LOG(ERR,
>                               "multi-packet send not supported on this device"
> @@ -1041,8 +977,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv 
> __rte_unused,
>               eth_dev->data->dev_private = priv;
>               priv->dev_data = eth_dev->data;
>               eth_dev->data->mac_addrs = priv->mac;
> -             eth_dev->device = &pci_dev->device;
> -             rte_eth_copy_pci_info(eth_dev, pci_dev);
> +             eth_dev->device = dpdk_dev;
>               eth_dev->device->driver = &mlx5_driver.driver;
>               err = mlx5_uar_init_primary(eth_dev);
>               if (err) {
> @@ -1160,13 +1095,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv 
> __rte_unused,
>                                priv, mem_event_cb);
>               rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
>               rte_eth_dev_probing_finish(eth_dev);
> -             /*
> -              * Each eth_dev instance is assigned its own Verbs context,
> -              * since this one is consumed, let the next iteration open
> -              * another.
> -              */
> -             ctx = NULL;
> -             continue;
> +             return eth_dev;
>  port_error:
>               if (priv)
>                       rte_free(priv);
> @@ -1174,24 +1103,173 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv 
> __rte_unused,
>                       claim_zero(mlx5_glue->dealloc_pd(pd));
>               if (eth_dev && rte_eal_process_type() == RTE_PROC_PRIMARY)
>                       rte_eth_dev_release_port(eth_dev);
> -             break;
>       }
> -     /*
> -      * XXX if something went wrong in the loop above, there is a resource
> -      * leak (ctx, pd, priv, dpdk ethdev) but we can do nothing about it as
> -      * long as the dpdk does not provide a way to deallocate a ethdev and a
> -      * way to enumerate the registered ethdevs to free the previous ones.
> -      */
>  error:
>       if (ctx)
>               claim_zero(mlx5_glue->close_device(ctx));
> -     if (list)
> -             mlx5_glue->free_device_list(list);
> -     if (err) {
> -             rte_errno = err;
> +     assert(err > 0);
> +     rte_errno = err;
> +     return NULL;
> +}
> +
> +/**
> + * Spawn Ethernet devices from Verbs information, one per detected port.
> + *
> + * @param dpdk_dev
> + *   Backing DPDK device.
> + * @param ibv_dev
> + *   Verbs device.
> + * @param vf
> + *   If nonzero, enable VF-specific features.
> + *
> + * @return
> + *   A NULL-terminated list of Ethernet device objects on success, NULL
> + *   otherwise and rte_errno is set. Caller is expected to release list
> + *   memory through free().
> + */
> +static struct rte_eth_dev **
> +mlx5_dev_spawn(struct rte_device *dpdk_dev,
> +            struct ibv_device *ibv_dev,
> +            int vf)
> +{
> +     struct rte_eth_dev **eth_list = NULL;
> +     struct ibv_context *ctx;
> +     struct ibv_device_attr_ex attr;
> +     unsigned int i;
> +     int ret;
> +
> +     errno = 0;
> +     ctx = mlx5_glue->open_device(ibv_dev);
> +     if (!ctx) {
> +             rte_errno = errno ? errno : ENODEV;
> +             if (rte_errno == ENODEV)
> +                     DRV_LOG(ERR,
> +                             "cannot access device, is mlx5_ib loaded?");
> +             else
> +                     DRV_LOG(ERR,
> +                             "cannot use device, are drivers up to date?");
> +             return NULL;
> +     }
> +     ret = mlx5_glue->query_device_ex(ctx, NULL, &attr);
> +     mlx5_glue->close_device(ctx);
> +     if (ret) {
> +             rte_errno = ret;
> +             DRV_LOG(ERR, "unable to query device information: %s",
> +                     strerror(rte_errno));
> +             return NULL;
> +     }
> +     DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
> +     eth_list = malloc(sizeof(*eth_list) *
> +                       (attr.orig_attr.phys_port_cnt + 1));
> +     if (!eth_list) {
> +             rte_errno = errno;
> +             return NULL;
> +     }
> +     for (i = 0; i < attr.orig_attr.phys_port_cnt; ++i) {
> +             eth_list[i] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev, vf,
> +                                              &attr, i + 1);
> +             if (eth_list[i])
> +                     continue;
> +             /* Save rte_errno and roll back in case of failure. */
> +             ret = rte_errno;
> +             while (i--) {
> +                     mlx5_dev_close(eth_list[i]);
> +                     if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> +                             rte_free(eth_list[i]->data->dev_private);
> +                     claim_zero(rte_eth_dev_release_port(eth_list[i]));
> +             }
> +             free(eth_list);
> +             rte_errno = ret;
> +             return NULL;
> +     }
> +     eth_list[i] = NULL;
> +     return eth_list;
> +}
> +
> +/**
> + * DPDK callback to register a PCI device.
> + *
> + * This function creates an Ethernet device for each port of a given
> + * PCI device.
> + *
> + * @param[in] pci_drv
> + *   PCI driver structure (mlx5_driver).
> + * @param[in] pci_dev
> + *   PCI device information.
> + *
> + * @return
> + *   0 on success, a negative errno value otherwise and rte_errno is set.
> + */
> +static int
> +mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
> +            struct rte_pci_device *pci_dev) {
> +     struct ibv_device **ibv_list;
> +     struct rte_eth_dev **eth_list = NULL;
> +     int vf;
> +     int ret;
> +
> +     assert(pci_drv == &mlx5_driver);
> +     switch (pci_dev->id.device_id) {
> +     case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
> +     case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
> +     case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
> +     case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
> +             vf = 1;
> +             break;
> +     default:
> +             vf = 0;
> +     }
> +     errno = 0;
> +     ibv_list = mlx5_glue->get_device_list(&ret);
> +     if (!ibv_list) {
> +             rte_errno = errno ? errno : ENOSYS;
> +             DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
>               return -rte_errno;
>       }
> -     return 0;
> +     while (ret-- > 0) {
> +             struct rte_pci_addr pci_addr;
> +
> +             DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name);
> +             if (mlx5_ibv_device_to_pci_addr(ibv_list[ret], &pci_addr))
> +                     continue;
> +             if (pci_dev->addr.domain != pci_addr.domain ||
> +                 pci_dev->addr.bus != pci_addr.bus ||
> +                 pci_dev->addr.devid != pci_addr.devid ||
> +                 pci_dev->addr.function != pci_addr.function)
> +                     continue;
> +             DRV_LOG(INFO, "PCI information matches, using device \"%s\"",
> +                     ibv_list[ret]->name);
> +             break;
> +     }
> +     if (ret >= 0)
> +             eth_list = mlx5_dev_spawn(&pci_dev->device, ibv_list[ret], vf);
> +     mlx5_glue->free_device_list(ibv_list);
> +     if (!ret) {
> +             DRV_LOG(WARNING,
> +                     "no Verbs device matches PCI device " PCI_PRI_FMT ","
> +                     " are kernel drivers loaded?",
> +                     pci_dev->addr.domain, pci_dev->addr.bus,
> +                     pci_dev->addr.devid, pci_dev->addr.function);
> +             rte_errno = ENOENT;
> +             ret = -rte_errno;
> +     } else if (!eth_list || !*eth_list) {
> +             DRV_LOG(ERR,
> +                     "probe of PCI device " PCI_PRI_FMT " aborted after"
> +                     " encountering an error: %s",
> +                     pci_dev->addr.domain, pci_dev->addr.bus,
> +                     pci_dev->addr.devid, pci_dev->addr.function,
> +                     strerror(rte_errno));
> +             ret = -rte_errno;
> +     } else {
> +             for (ret = 0; eth_list[ret]; ++ret) {
> +                     rte_eth_copy_pci_info(eth_list[ret], pci_dev);
> +                     rte_eth_dev_probing_finish(eth_list[ret]);
> +             }
> +             ret = 0;
> +     }
> +     free(eth_list);
> +     return ret;
>  }
> 
>  static const struct rte_pci_id mlx5_pci_id_map[] = {
> --
> 2.11.0

Reply via email to