Re: [dpdk-dev] [PATCH v2 1/7] net/mlx5: rename confusing object in probe code
> -Original Message- > From: dev On Behalf Of Adrien Mazarguil > Sent: Thursday, June 14, 2018 4:35 PM > To: Shahaf Shuler > Cc: dev@dpdk.org > Subject: [dpdk-dev] [PATCH v2 1/7] net/mlx5: rename confusing object in probe > code > > There are several attribute objects in this function: > > - IB device attributes (struct ibv_device_attr_ex device_attr). > - Direct Verbs attributes (struct mlx5dv_context attrs_out). > - Port attributes (struct ibv_port_attr). > - IB device attributes again (struct ibv_device_attr_ex device_attr_ex). > > "attrs_out" is both odd and initialized using a nonstandard syntax. Rename it > "dv_attr" for > consistency. > > Signed-off-by: Adrien Mazarguil > -- > v2 changes: > > - Fixed ctx -> attr_ctx in mlx5_pci_probe(). > --- > drivers/net/mlx5/mlx5.c | 34 +- > 1 file changed, 17 insertions(+), 17 deletions(-) > > diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index > 3e0a1b186..3bdcb3970 100644 > --- a/drivers/net/mlx5/mlx5.c > +++ b/drivers/net/mlx5/mlx5.c > @@ -654,6 +654,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv > __rte_unused, { > struct ibv_device **list = NULL; > struct ibv_device *ibv_dev; > + struct mlx5dv_context dv_attr = { .comp_mask = 0 }; > int err = 0; > struct ibv_context *attr_ctx = NULL; > struct ibv_device_attr_ex device_attr; @@ -670,7 +671,6 @@ > mlx5_pci_probe(struct rte_pci_driver > *pci_drv __rte_unused, > unsigned int mprq_min_stride_num_n = 0; > unsigned int mprq_max_stride_num_n = 0; > int i; > - struct mlx5dv_context attrs_out = {0}; > #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT > struct ibv_counter_set_description cs_desc = { .counter_type = 0 }; > #endif @@ -736,21 +736,21 > @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, > ibv_dev = list[i]; > DRV_LOG(DEBUG, "device opened"); > #ifdef HAVE_IBV_MLX5_MOD_SWP > - attrs_out.comp_mask |= MLX5DV_CONTEXT_MASK_SWP; > + dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP; > #endif > /* >* Multi-packet send is supported by ConnectX-4 Lx PF as well >* as all ConnectX-5 devices. >*/ > #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT > - attrs_out.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS; > + dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS; > #endif > #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT > - attrs_out.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ; > + dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ; > #endif > - mlx5_glue->dv_query_device(attr_ctx, &attrs_out); > - if (attrs_out.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) { > - if (attrs_out.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) { > + mlx5_glue->dv_query_device(attr_ctx, &dv_attr); > + if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) { > + if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) { > DRV_LOG(DEBUG, "enhanced MPW is supported"); > mps = MLX5_MPW_ENHANCED; > } else { > @@ -762,14 +762,14 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv > __rte_unused, > mps = MLX5_MPW_DISABLED; > } > #ifdef HAVE_IBV_MLX5_MOD_SWP > - if (attrs_out.comp_mask & MLX5DV_CONTEXT_MASK_SWP) > - swp = attrs_out.sw_parsing_caps.sw_parsing_offloads; > + if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP) > + swp = dv_attr.sw_parsing_caps.sw_parsing_offloads; > DRV_LOG(DEBUG, "SWP support: %u", swp); #endif #ifdef > HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT > - if (attrs_out.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) { > + if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) { > struct mlx5dv_striding_rq_caps mprq_caps = > - attrs_out.striding_rq_caps; > + dv_attr.striding_rq_caps; > > DRV_LOG(DEBUG, "\tmin_single_stride_log_num_of_bytes: %d", > mprq_caps.min_single_stride_log_num_of_bytes); > @@ -794,15 +794,15 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv > __rte_unused, > } > #endif > if (RTE_CACHE_LINE_SIZE == 128 && > - !(attrs_out.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP)) > + !(dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP)) > cqe_comp = 0; > else > cqe_comp = 1; > #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT > - if (attrs_out.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) { > - tunnel_en = ((attrs_out.tunnel_offloads_caps & > + if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) { > + tunnel_en = ((dv_attr.tunnel_offloads_caps & > MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN) && > - (attrs_out.tunnel_offloads_caps & > + (dv_attr.tunnel_offloads_caps & > MLX5DV_R
Re: [dpdk-dev] [PATCH v2 2/7] net/mlx5: remove redundant objects in probe code
> -Original Message- > From: dev On Behalf Of Adrien Mazarguil > Sent: Thursday, June 14, 2018 4:35 PM > To: Shahaf Shuler > Cc: dev@dpdk.org > Subject: [dpdk-dev] [PATCH v2 2/7] net/mlx5: remove redundant objects in > probe code > > This patch gets rid of redundant calls to open the device and query its > attributes in order to > simplify the code. > > Signed-off-by: Adrien Mazarguil > -- > v2 changes: > > - Minor indent fix on existing code. > --- > drivers/net/mlx5/mlx5.c | 64 +--- > 1 file changed, 30 insertions(+), 34 deletions(-) > > diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index > 3bdcb3970..1a5391e63 100644 > --- a/drivers/net/mlx5/mlx5.c > +++ b/drivers/net/mlx5/mlx5.c > @@ -654,10 +654,10 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv > __rte_unused, { > struct ibv_device **list = NULL; > struct ibv_device *ibv_dev; > + struct ibv_context *ctx = NULL; > + struct ibv_device_attr_ex attr; > struct mlx5dv_context dv_attr = { .comp_mask = 0 }; > int err = 0; > - struct ibv_context *attr_ctx = NULL; > - struct ibv_device_attr_ex device_attr; > unsigned int vf = 0; > unsigned int mps; > unsigned int cqe_comp; > @@ -714,12 +714,12 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv > __rte_unused, > PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) || > (pci_dev->id.device_id == > PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF)); > - attr_ctx = mlx5_glue->open_device(list[i]); > + ctx = mlx5_glue->open_device(list[i]); > rte_errno = errno; > err = rte_errno; > break; > } > - if (attr_ctx == NULL) { > + if (ctx == NULL) { > switch (err) { > case 0: > DRV_LOG(ERR, > @@ -748,7 +748,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv > __rte_unused, #ifdef > HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT > dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ; #endif > - mlx5_glue->dv_query_device(attr_ctx, &dv_attr); > + mlx5_glue->dv_query_device(ctx, &dv_attr); > if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) { > if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) { > DRV_LOG(DEBUG, "enhanced MPW is supported"); @@ -822,23 > +822,20 @@ > mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, > DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to" > " old OFED/rdma-core version or firmware configuration"); > #endif > - err = mlx5_glue->query_device_ex(attr_ctx, NULL, &device_attr); > + err = mlx5_glue->query_device_ex(ctx, NULL, &attr); > if (err) { > DEBUG("ibv_query_device_ex() failed"); > goto error; > } > - DRV_LOG(INFO, "%u port(s) detected", > - device_attr.orig_attr.phys_port_cnt); > - for (i = 0; i < device_attr.orig_attr.phys_port_cnt; i++) { > + DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt); > + for (i = 0; i < attr.orig_attr.phys_port_cnt; i++) { > char name[RTE_ETH_NAME_MAX_LEN]; > int len; > uint32_t port = i + 1; /* ports are indexed from one */ > - struct ibv_context *ctx = NULL; > struct ibv_port_attr port_attr; > struct ibv_pd *pd = NULL; > struct priv *priv = NULL; > struct rte_eth_dev *eth_dev = NULL; > - struct ibv_device_attr_ex device_attr_ex; > struct ether_addr mac; > struct mlx5_dev_config config = { > .cqe_comp = cqe_comp, > @@ -865,7 +862,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv > __rte_unused, > len = snprintf(name, sizeof(name), PCI_PRI_FMT, >pci_dev->addr.domain, pci_dev->addr.bus, >pci_dev->addr.devid, pci_dev->addr.function); > - if (device_attr.orig_attr.phys_port_cnt > 1) > + if (attr.orig_attr.phys_port_cnt > 1) > snprintf(name + len, sizeof(name), " port %u", i); > if (rte_eal_process_type() == RTE_PROC_SECONDARY) { > eth_dev = rte_eth_dev_attach_secondary(name); > @@ -907,7 +904,8 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv > __rte_unused, > continue; > } > DRV_LOG(DEBUG, "using port %u", port); > - ctx = mlx5_glue->open_device(ibv_dev); > + if (!ctx) > + ctx = mlx5_glue->open_device(ibv_dev); > if (ctx == NULL) { > err = ENODEV; > goto port_error; > @@ -949,7 +947,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv > __rte_unused, > priv->ctx = ctx; > strncpy(priv-
Re: [dpdk-dev] [PATCH v2 3/7] net/mlx5: split PCI from generic probing code
Reviewed-by: Xueming Li > -Original Message- > From: dev On Behalf Of Adrien Mazarguil > Sent: Thursday, June 14, 2018 4:35 PM > To: Shahaf Shuler > Cc: dev@dpdk.org > Subject: [dpdk-dev] [PATCH v2 3/7] net/mlx5: split PCI from generic probing > code > > All the generic probing code needs is an IB device. While this device is > currently supplied by a PCI > lookup, other methods will be added soon. > > This patch divides the original function, which has become huge over time, as > follows: > > 1. PCI-specific (mlx5_pci_probe()). > 2. All ports of a Verbs device (mlx5_dev_spawn()). > 3. A given port of a Verbs device (mlx5_dev_spawn_one()). > > (Patch based on prior work from Yuanhan Liu) > > Signed-off-by: Adrien Mazarguil > -- > v2 changes: > > - Fixed device naming. A port suffix is now appended only if several IB > ports happen to be detected. > - Added separate message to distinguish missing kernel drivers from other > initialization errors, as it was confusing. > --- > drivers/net/mlx5/mlx5.c | 340 ++- > 1 file changed, 209 insertions(+), 131 deletions(-) > > diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index > 1a5391e63..01dcf25b9 100644 > --- a/drivers/net/mlx5/mlx5.c > +++ b/drivers/net/mlx5/mlx5.c > @@ -635,30 +635,34 @@ mlx5_uar_init_secondary(struct rte_eth_dev *dev) } > > /** > - * DPDK callback to register a PCI device. > - * > - * This function creates an Ethernet device for each port of a given > - * PCI device. > + * Spawn an Ethernet device from Verbs information. > * > - * @param[in] pci_drv > - * PCI driver structure (mlx5_driver). > - * @param[in] pci_dev > - * PCI device information. > + * @param dpdk_dev > + * Backing DPDK device. > + * @param ibv_dev > + * Verbs device. > + * @param vf > + * If nonzero, enable VF-specific features. > + * @param[in] attr > + * Verbs device attributes. > + * @param port > + * Verbs port to use (indexed from 1). > * > * @return > - * 0 on success, a negative errno value otherwise and rte_errno is set. > + * A valid Ethernet device object on success, NULL otherwise and rte_errno > + * is set. > */ > -static int > -mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, > -struct rte_pci_device *pci_dev) > +static struct rte_eth_dev * > +mlx5_dev_spawn_one(struct rte_device *dpdk_dev, > +struct ibv_device *ibv_dev, > +int vf, > +const struct ibv_device_attr_ex *attr, > +unsigned int port) > { > - struct ibv_device **list = NULL; > - struct ibv_device *ibv_dev; > - struct ibv_context *ctx = NULL; > - struct ibv_device_attr_ex attr; > + struct ibv_context *ctx; > struct mlx5dv_context dv_attr = { .comp_mask = 0 }; > + struct rte_eth_dev *eth_dev = NULL; > int err = 0; > - unsigned int vf = 0; > unsigned int mps; > unsigned int cqe_comp; > unsigned int tunnel_en = 0; > @@ -670,71 +674,18 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv > __rte_unused, > unsigned int mprq_max_stride_size_n = 0; > unsigned int mprq_min_stride_num_n = 0; > unsigned int mprq_max_stride_num_n = 0; > - int i; > #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT > struct ibv_counter_set_description cs_desc = { .counter_type = 0 }; > #endif > > /* Prepare shared data between primary and secondary process. */ > mlx5_prepare_shared_data(); > - assert(pci_drv == &mlx5_driver); > - list = mlx5_glue->get_device_list(&i); > - if (list == NULL) { > - assert(errno); > - err = errno; > - if (errno == ENOSYS) > - DRV_LOG(ERR, > - "cannot list devices, is ib_uverbs loaded?"); > - goto error; > - } > - assert(i >= 0); > - /* > - * For each listed device, check related sysfs entry against > - * the provided PCI ID. > - */ > - while (i != 0) { > - struct rte_pci_addr pci_addr; > - > - --i; > - DRV_LOG(DEBUG, "checking device \"%s\"", list[i]->name); > - if (mlx5_ibv_device_to_pci_addr(list[i], &pci_addr)) > - continue; > - if ((pci_dev->addr.domain != pci_addr.domain) || > - (pci_dev->addr.bus != pci_addr.bus) || > - (pci_dev->addr.devid != pci_addr.devid) || > - (pci_dev->addr.function != pci_addr.function)) > - continue; > - DRV_LOG(INFO, "PCI information matches, using device \"%s\"", > - list[i]->name); > - vf = ((pci_dev->id.device_id == > -PCI_DEVICE_ID_MELLANOX_CONNECTX4VF) || > - (pci_dev->id.device_id == > -PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF) || > - (pci_dev->id.device_id == > -
Re: [dpdk-dev] [PATCH v2 5/7] net/mlx5: add port representor awareness
Reviewed-by: Xueming Li One minor issue we should be able to ignore. > -Original Message- > From: dev On Behalf Of Adrien Mazarguil > Sent: Thursday, June 14, 2018 4:35 PM > To: Shahaf Shuler > Cc: dev@dpdk.org > Subject: [dpdk-dev] [PATCH v2 5/7] net/mlx5: add port representor awareness > > The current PCI probing method is not aware of Verbs port representors, which > appear as standard Verbs > devices bound to the same PCI address and cannot be distinguished. > > Problem is that more often than not, the wrong Verbs device is used, > resulting in unexpected traffic. > > This patch adds necessary heuristics to bind affected driver instances to the > intended (i.e. non- > representor) device. > > (Patch based on prior work from Yuanhan Liu) > > Signed-off-by: Adrien Mazarguil > -- > v2 changes: > > - Fixed digit detection in mlx5_cmp_ibv_name() so that "foo1" and "foo10" > are compared on the integer conversion of "1" against "10" instead of "" > and "0". > --- > drivers/net/mlx5/mlx5.c | 66 > 1 file changed, 61 insertions(+), 5 deletions(-) > > diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index > c9815d721..498f80c89 100644 > --- a/drivers/net/mlx5/mlx5.c > +++ b/drivers/net/mlx5/mlx5.c > @@ -3,6 +3,7 @@ > * Copyright 2015 Mellanox Technologies, Ltd > */ > > +#include > #include > #include > #include > @@ -1170,6 +1171,34 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev, } > > /** > + * Comparison callback to sort Verbs device names. > + * > + * This is meant to be used with qsort(). > + * > + * @param a[in] > + * Pointer to pointer to first Verbs device. > + * @param b[in] > + * Pointer to pointer to second Verbs device. > + * > + * @return > + * 0 if both names are equal, less than 0 if the first argument is less > + * than the second, greater than 0 otherwise. > + */ > +static int > +mlx5_cmp_ibv_name(const void *a, const void *b) { > + const char *name_a = (*(const struct ibv_device *const *)a)->name; > + const char *name_b = (*(const struct ibv_device *const *)b)->name; > + size_t i = 0; > + > + while (name_a[i] && name_a[i] == name_b[i]) > + ++i; > + while (i && isdigit(name_a[i - 1]) && isdigit(name_b[i - 1])) name_a[i - 1] and name_b[i - 1] must be same here. > + --i; > + return atoi(name_a + i) - atoi(name_b + i); } > + > +/** > * DPDK callback to register a PCI device. > * > * This function creates an Ethernet device for each port of a given @@ > -1189,6 +1218,7 @@ > mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, { > struct ibv_device **ibv_list; > struct rte_eth_dev **eth_list = NULL; > + int n = 0; > int vf; > int ret; > > @@ -1210,6 +1240,9 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv > __rte_unused, > DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?"); > return -rte_errno; > } > + > + struct ibv_device *ibv_match[ret + 1]; > + > while (ret-- > 0) { > struct rte_pci_addr pci_addr; > > @@ -1221,14 +1254,37 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv > __rte_unused, > pci_dev->addr.devid != pci_addr.devid || > pci_dev->addr.function != pci_addr.function) > continue; > - DRV_LOG(INFO, "PCI information matches, using device \"%s\"", > + DRV_LOG(INFO, "PCI information matches for device \"%s\"", > ibv_list[ret]->name); > - break; > + ibv_match[n++] = ibv_list[ret]; > + } > + ibv_match[n] = NULL; > + if (n > 1) { > + /* > + * The existence of several matching entries means port > + * representors have been instantiated. No existing Verbs > + * call nor /sys entries can tell them apart at this point. > + * > + * While definitely hackish, assume their names are numbered > + * based on order of creation with master device first, > + * followed by first port representor, followed by the > + * second one and so on. > + */ > + DRV_LOG(WARNING, > + "probing device with port representors involves" > + " heuristics with uncertain outcome"); > + qsort(ibv_match, n, sizeof(*ibv_match), mlx5_cmp_ibv_name); > + DRV_LOG(WARNING, "assuming \"%s\" is the master device", > + ibv_match[0]->name); > + for (ret = 1; ret < n; ++ret) > + DRV_LOG(WARNING, > + "assuming \"%s\" is port representor #%d", > + ibv_match[ret]->name, ret - 1); > } > - if (ret >= 0) > - eth_list = mlx5_dev_spawn(&pci_dev->device, ibv_list[ret], vf); > + if (n) > + eth_list = mlx
Re: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors
Reviewed-by: Xueming Li Minor comments inside: > -Original Message- > From: dev On Behalf Of Adrien Mazarguil > Sent: Thursday, June 14, 2018 4:35 PM > To: Shahaf Shuler > Cc: dev@dpdk.org > Subject: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors > > Probe existing port representors in addition to their master device and > associate them automatically. > > To avoid name collision between Ethernet devices, their names use the same > convention as ixgbe and > i40e PMDs, that is, instead of only a PCI address in DBDF notation: > > - "net_{DBDF}_0" for master/switch devices. > - "net_{DBDF}_representor_{rep}" with "rep" starting from 0 for port > representors. > > Both optionally suffixed with "_port_{num}" instead of " port {num}" for > devices that expose several > Verbs ports (note this is never the case on mlx5, but kept for historical > reasons for the time being). > > (Patch based on prior work from Yuanhan Liu) > > Signed-off-by: Adrien Mazarguil > -- > v2 changes: > > - Added representor information to dev_infos_get(). DPDK port ID of master > device is now stored in the private structure to retrieve it > conveniently. > - Master device is assigned dummy representor ID value -1 to better > distinguish from the the first actual representor reported by > dev_infos_get() as those are indexed from 0. > - Added RTE_ETH_DEV_REPRESENTOR device flag. > --- > drivers/net/mlx5/mlx5.c| 138 > drivers/net/mlx5/mlx5.h| 9 ++- > drivers/net/mlx5/mlx5_ethdev.c | 151 > drivers/net/mlx5/mlx5_mac.c| 2 +- > drivers/net/mlx5/mlx5_stats.c | 6 +- > 5 files changed, 252 insertions(+), 54 deletions(-) > > diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index > 498f80c89..716c9d9a5 100644 > --- a/drivers/net/mlx5/mlx5.c > +++ b/drivers/net/mlx5/mlx5.c > @@ -304,6 +304,9 @@ mlx5_dev_close(struct rte_eth_dev *dev) > if (ret) > DRV_LOG(WARNING, "port %u some flows still remain", > dev->data->port_id); > + if (!priv->representor && > + priv->domain_id != RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) > + claim_zero(rte_eth_switch_domain_free(priv->domain_id)); > memset(priv, 0, sizeof(*priv)); > } > > @@ -648,6 +651,10 @@ mlx5_uar_init_secondary(struct rte_eth_dev *dev) > * Verbs device attributes. > * @param port > * Verbs port to use (indexed from 1). > + * @param master > + * Master device in case @p ibv_dev is a port representor. > + * @param rep_id > + * Representor identifier when @p master is non-NULL. > * > * @return > * A valid Ethernet device object on success, NULL otherwise and rte_errno > @@ -658,7 +665,9 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev, > struct ibv_device *ibv_dev, > int vf, > const struct ibv_device_attr_ex *attr, > -unsigned int port) > +unsigned int port, > +struct rte_eth_dev *master, > +unsigned int rep_id) > { > struct ibv_context *ctx; > struct ibv_port_attr port_attr; > @@ -802,11 +811,14 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev, > " old OFED/rdma-core version or firmware configuration"); > #endif > config.mpls_en = mpls_en; > - if (attr->orig_attr.phys_port_cnt > 1) > - snprintf(name, sizeof(name), "%s port %u", > - dpdk_dev->name, port); > + if (!master) > + snprintf(name, sizeof(name), "net_%s_0", dpdk_dev->name); > else > - snprintf(name, sizeof(name), "%s", dpdk_dev->name); > + snprintf(name, sizeof(name), "net_%s_representor_%u", > + dpdk_dev->name, rep_id); > + if (attr->orig_attr.phys_port_cnt > 1) > + snprintf(name, sizeof(name), "%s_port_%u", name, port); > + DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name); > if (rte_eal_process_type() == RTE_PROC_SECONDARY) { > eth_dev = rte_eth_dev_attach_secondary(name); > if (eth_dev == NULL) { > @@ -883,6 +895,30 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev, > priv->port = port; > priv->pd = pd; > priv->mtu = ETHER_MTU; > + /* > + * Allocate a switch domain for master devices and share it with > + * port representors. > + */ > + if (!master) { > + priv->representor = 0; > + priv->master_id = -1; /* Updated once known. */ > + priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID; Domain_id will override below. > + priv->rep_id = -1; /* Dummy unique value. */ > + err = rte_eth_switch_domain_alloc(&priv->domain_id); > + if (err) { > + err = rte_errno; > + DRV_LOG(ERR, "unable to allocate switch domain: %s", > +
Re: [dpdk-dev] [PATCH v2 7/7] net/mlx5: add parameter for port representors
Reviewed-by: Xueming Li > -Original Message- > From: dev On Behalf Of Adrien Mazarguil > Sent: Thursday, June 14, 2018 4:35 PM > To: Shahaf Shuler > Cc: dev@dpdk.org > Subject: [dpdk-dev] [PATCH v2 7/7] net/mlx5: add parameter for port > representors > > Prior to this patch, all port representors detected on a given device were > probed and Ethernet devices > instantiated for each of them. > > This patch adds support for the standard "representor" parameter, which > implies that port representors > are not probed by default anymore, except for the list provided through > device arguments. > > (Patch based on prior work from Yuanhan Liu) > > Signed-off-by: Adrien Mazarguil > -- > v2 changes: > > - Added error message for when rte_eth_devargs_parse() fails. > --- > doc/guides/nics/mlx5.rst| 12 > doc/guides/prog_guide/poll_mode_drv.rst | 2 ++ > drivers/net/mlx5/mlx5.c | 29 > 3 files changed, 43 insertions(+) > > diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst index > 79c982e29..5229e546c 100644 > --- a/doc/guides/nics/mlx5.rst > +++ b/doc/guides/nics/mlx5.rst > @@ -388,6 +388,18 @@ Run-time configuration > >Disabled by default. > > +- ``representor`` parameter [list] > + > + This parameter can be used to instantiate DPDK Ethernet devices from > + existing port (or VF) representors configured on the device. > + > + It is a standard parameter whose format is described in > + :ref:`ethernet_device_standard_device_arguments`. > + > + For instance, to probe port representors 0 through 2:: > + > +representor=[0-2] > + > Firmware configuration > ~~ > > diff --git a/doc/guides/prog_guide/poll_mode_drv.rst > b/doc/guides/prog_guide/poll_mode_drv.rst > index af82352a0..58d49ba0f 100644 > --- a/doc/guides/prog_guide/poll_mode_drv.rst > +++ b/doc/guides/prog_guide/poll_mode_drv.rst > @@ -365,6 +365,8 @@ Ethernet Device API > > The Ethernet device API exported by the Ethernet PMDs is described in the > *DPDK API Reference*. > > +.. _ethernet_device_standard_device_arguments: > + > Ethernet Device Standard Device Arguments > ~ > > diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index > 716c9d9a5..26e61d99d 100644 > --- a/drivers/net/mlx5/mlx5.c > +++ b/drivers/net/mlx5/mlx5.c > @@ -91,6 +91,9 @@ > /* Activate Netlink support in VF mode. */ #define MLX5_VF_NL_EN "vf_nl_en" > > +/* Select port representors to instantiate. */ #define MLX5_REPRESENTOR > +"representor" > + > #ifndef HAVE_IBV_MLX5_MOD_MPW > #define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2) #define > MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3) > @@ -423,6 +426,9 @@ mlx5_args_check(const char *key, const char *val, void > *opaque) > struct mlx5_dev_config *config = opaque; > unsigned long tmp; > > + /* No-op, port representors are processed in mlx5_dev_spawn(). */ > + if (!strcmp(MLX5_REPRESENTOR, key)) > + return 0; > errno = 0; > tmp = strtoul(val, NULL, 0); > if (errno) { > @@ -495,6 +501,7 @@ mlx5_args(struct mlx5_dev_config *config, struct > rte_devargs *devargs) > MLX5_RX_VEC_EN, > MLX5_L3_VXLAN_EN, > MLX5_VF_NL_EN, > + MLX5_REPRESENTOR, > NULL, > }; > struct rte_kvargs *kvlist; > @@ -1173,13 +1180,34 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev, > struct rte_eth_dev **eth_list = NULL; > struct ibv_context *ctx; > struct ibv_device_attr_ex attr; > + struct rte_eth_devargs eth_da; > void *tmp; > unsigned int i; > unsigned int j = 0; > unsigned int n = 0; > int ret; > > + if (dpdk_dev->devargs) { > + ret = rte_eth_devargs_parse(dpdk_dev->devargs->args, ð_da); > + if (ret) { > + rte_errno = -ret; > + DRV_LOG(ERR, "failed to process device arguments: %s", > + strerror(rte_errno)); > + goto error; > + } > + } else { > + memset(ð_da, 0, sizeof(eth_da)); > + } > next: > + if (j) { > + unsigned int k; > + > + for (k = 0; k < eth_da.nb_representor_ports; ++k) > + if (eth_da.representor_ports[k] == j - 1) > + break; > + if (k == eth_da.nb_representor_ports) > + goto skip; > + } > errno = 0; > ctx = mlx5_glue->open_device(ibv_dev[j]); > if (!ctx) { > @@ -1218,6 +1246,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev, > goto error; > ++n; > } > +skip: > if (ibv_dev[++j]) > goto next; > eth_list[n] = NULL; > -- > 2.11.0
Re: [dpdk-dev] [PATCH 2/3] app/testpmd: enable UDP GSO in the checksum forwarding engine
Hi Bernard, > -Original Message- > From: Iremonger, Bernard > Sent: Thursday, June 14, 2018 10:44 PM > To: Hu, Jiayu ; dev@dpdk.org > Cc: Ananyev, Konstantin ; Hu, Jiayu > > Subject: RE: [dpdk-dev] [PATCH 2/3] app/testpmd: enable UDP GSO in the > checksum forwarding engine > > Hi Jiayu, > > > -Original Message- > > From: dev [mailto:dev-boun...@dpdk.org] On Behalf Of Jiayu Hu > > Sent: Tuesday, May 29, 2018 8:41 AM > > To: dev@dpdk.org > > Cc: Ananyev, Konstantin ; Hu, Jiayu > > > > Subject: [dpdk-dev] [PATCH 2/3] app/testpmd: enable UDP GSO in the > > checksum forwarding engine > > > > This patch enables GSO for UDP/IPv4 packets. Oversized UDP/IPv4 > packets > > transmitted over a GSO-enabled port will undergo segmentation. > > > > Signed-off-by: Jiayu Hu > > --- > > app/test-pmd/cmdline.c | 5 +++-- > > app/test-pmd/csumonly.c | 2 ++ > > app/test-pmd/testpmd.c | 2 +- > > 3 files changed, 6 insertions(+), 3 deletions(-) > > > > diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c index > > 27e2aa8..4239e91 100644 > > --- a/app/test-pmd/cmdline.c > > +++ b/app/test-pmd/cmdline.c > > @@ -4792,8 +4792,9 @@ cmd_gso_show_parsed(void *parsed_result, > > if (gso_ports[res->cmd_pid].enable) { > > printf("Max GSO'd packet size: %uB\n" > > "Supported GSO types: TCP/IPv4, " > > - "VxLAN with inner TCP/IPv4 packet, > " > > - "GRE with inner TCP/IPv4 > packet\n", > > + "UDP/IPv4, VxLAN with inner " > > + "TCP/IPv4 packet, GRE with inner " > > + "TCP/IPv4 packet\n", > > This change is giving 3 checkpatch.pl warnings and should be refactored The file test-pmd/cmdline.c uses the above coding style to split long strings, so I think I need to keep this style. But when I use it, there are always some warnings. > > > > gso_max_segment_size); > > } else > > printf("GSO is not enabled on Port %u\n", res- > > >cmd_pid); diff --git a/app/test-pmd/csumonly.c b/app/test- > > pmd/csumonly.c index 0bb88cf..4948292 100644 > > --- a/app/test-pmd/csumonly.c > > +++ b/app/test-pmd/csumonly.c > > @@ -411,6 +411,8 @@ process_inner_cksums(void *l3_hdr, const struct > > testpmd_offload_info *info, > > info->ethertype); > > } > > } > > + if (info->gso_enable) > > + ol_flags |= PKT_TX_UDP_SEG; > > } else if (info->l4_proto == IPPROTO_TCP) { > > tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + info->l3_len); > > tcp_hdr->cksum = 0; > > diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c index > > 35cf266..b5766ff 100644 > > --- a/app/test-pmd/testpmd.c > > +++ b/app/test-pmd/testpmd.c > > @@ -777,7 +777,7 @@ init_config(void) > > init_port_config(); > > > > gso_types = DEV_TX_OFFLOAD_TCP_TSO | > > DEV_TX_OFFLOAD_VXLAN_TNL_TSO | > > - DEV_TX_OFFLOAD_GRE_TNL_TSO; > > + DEV_TX_OFFLOAD_GRE_TNL_TSO | > > DEV_TX_OFFLOAD_UDP_TSO; > > /* > > * Records which Mbuf pool to use by each logical core, if needed. > > */ > > -- > > 2.7.4 > > ./devtools/check-git-log.sh -1 > Headline too long: > app/testpmd: enable UDP GSO in the checksum forwarding engine Will change it in the next version. > > Should the dpdk/doc/guides/testpmd_ap_ug/testpmd_funcs.rst file be > updated? I will add some descriptions about UDP/IPv4 GSO usage guide in it. Thanks, Jiayu > > Regards, > > Bernard >
Re: [dpdk-dev] [PATCH] net/bonding: fix link properties with autoneg
On Thu, Jun 14, 2018 at 1:04 PM Ferruh Yigit wrote: > On 4/16/2018 8:09 PM, Matan Azrad wrote: > > Hi Chas > > > > From: Chas Williams, Monday, April 16, 2018 7:44 PM > >> On Mon, Apr 16, 2018 at 4:06 AM, Matan Azrad > >> wrote: > >>> Hi Chas > >>> > >>> From: Chas Williams, Wednesday, February 14, 2018 12:55 AM > If a link is carrier down and using autonegotiation, then the PMD may > not have detected a speed yet. In this case the best we can do is > ignore the link speed and duplex since they aren't valid. > >>> > >>> Ok for this. > >>> > To be completely correct, there > should be additional checks to prevent a slave that negotiates a > different speed from being activated. > >>> > >>> Looks like every changing in the link properties should cause LSC > interrupt. > >>> In the bonding LCS interrupt you could handle and to deactivate the > device. > >>> Also you should deal with the case of the first slave, what is happen > if the > >> first slave has invalid link properties? > >>> How can you know that the speed\duplex_mode is invalid? > >>> Are we sure LACP mode can run with auto negotiation? > >> > >> Yes, I am pretty sure bonding doesn't get this right when the interfaces > >> aren't link up. While what bonding is doing is likely wrong, it > doesn't mean > >> that the behavior of the PMDs are correct in leaving the link_status > unset > >> until the first LSC interrupt. > >> > >> I plan to get around to looking at this bonding problem in a little > bit. Luckily it > >> seems that we always tend to get matched links and even if bonding is > >> advertising the wrong aggregate speed and duplex we are find for now. > It > >> wouldn't pass close inspection by a protocol analyzer though. > >> > > > > So, Are you going to fix it, > > If no, I think you can open a bug in Bugzilla. > > Hi Matan, Chas, > > What is the latest status of the patch? > And I guess there is another issue as well discussed here, is it still > valid? > > Thanks, > ferruh > I think this issue is better addressed by http://dpdk.org/dev/patchwork/patch/40572/ There's just a little more cleanup that needs to be done in that patch.
Re: [dpdk-dev] [PATCH v2] net/pcap: rx_iface_in stream type support
Is pcap_sendpacket() to the same pcap_t handle thread-safe? I couldn't find clear answer so I'd rather assume not. If it's not thread-safe then supporting multiple "iface"'s will require multiple pcap_open_live()'s and we are back in the same place. >> I am not sure exiting behavior is intentional, which is capturing sent >> packages in Rx pcap handler to same interface. >> Are you aware of any use case of existing behavior? Perhaps it can be option >> to set PCAP_D_IN by default for rx_iface argument. Even if unintentional I find it very useful for testing, as this way it's very easy to send traffic to the app by tcpreplay on the same host the app is running on. Using tcpreplay is in the out direction that will not be captured if PCAP_D_IN is set. If PCAP_D_IN is the only option then it will require external device (or some networking trick) to send packets to the app. So, I'd say it is good for testing but less good for real functionality -Original Message- From: Ferruh Yigit Sent: Friday, June 15, 2018 3:53 PM To: Ido Goshen Cc: dev@dpdk.org Subject: Re: [PATCH v2] net/pcap: rx_iface_in stream type support On 6/14/2018 9:44 PM, Ido Goshen wrote: > I think we are starting to mix two things One is how to configure pcap > eth dev with multiple queues and I totally agree it would have been nicer to > just say something like "max_tx_queues =N" instead of needing to write > "tx_iface" N times, but as it was already supported in that way (for any > reason?) I wasn't trying to enhance or change it. > The other issue is pcap direction API, which I was trying to expose to users > of dpdk pcap device. Hi Ido, Assuming "iface" argument solves the direction issue, I am suggestion adding multiqueue support to "iface" argument as a solution to your problem. I am not suggesting using new arguments like "max_tx_queues =N", "iface" can be used same as how rx/tx_ifcase used now, provide it multiple times. > Refer to https://www.tcpdump.org/manpages/pcap_setdirection.3pcap.txt > or man tcpdump for -P/--direction in|out|inout option, Actually I > think a more realistic emulation of a physical device (non-virtual) > would be to capture only the incoming direction (set PCAP_D_IN), again > the existing behavior is very useful too and I didn't try to change or > eliminate it but just add additional stream type option I am not sure exiting behavior is intentional, which is capturing sent packages in Rx pcap handler to same interface. Are you aware of any use case of existing behavior? Perhaps it can be option to set PCAP_D_IN by default for rx_iface argument. > > -Original Message- > From: Ferruh Yigit > Sent: Thursday, June 14, 2018 9:09 PM > To: Ido Goshen > Cc: dev@dpdk.org > Subject: Re: [PATCH v2] net/pcap: rx_iface_in stream type support > > On 6/14/2018 6:14 PM, Ido Goshen wrote: >> I use "rx_iface","tx_iface" (and not just "iface") in order to have >> multiple TX queues I just gave a simplified setting with 1 queue My >> app does a full mesh between the ports (not fixed pairs like l2fwd) >> so all the forwarding lcores can tx to the same port simultaneously and as >> DPDK docs say: >> "Multiple logical cores should never share receive or transmit queues for >> interfaces since this would require global locks and hinder performance." >> For example if I have 3 ports handled by 3 cores it'll be >> myapp -c 7 -n1 --no-huge \ >> >> --vdev=eth_pcap0,rx_iface=eth0,tx_iface=eth0,tx_iface=eth0,tx_iface=eth0 \ >> >> --vdev=eth_pcap0,rx_iface=eth1,tx_iface=eth1,tx_iface=eth1,tx_iface=eth1 \ >> >> --vdev=eth_pcap0,rx_iface=eth2,tx_iface=eth2,tx_iface=eth2,tx_iface=eth2 \ >> -- -p 7 >> Is there another way to achieve multiple queues in pcap vdev? > > If you want to use multiple core you need multiple queues, as you said, and > above is the way to create multiple queues for pcap. > > Currently "iface" argument only supports single interface in a hardcoded way, > but technically it should be possible to update it to support multiple queue. > > So if "iface" arguments works for you, it can be better to add multi queue > support to "iface" instead of introducing a new device argument. > >> >> I do see that using "iface" behaves differently - I'll try to >> investigate why > > pcap_open_live() is called for both arguments, for "rx_iface/tx_iface" pair > it has been called twice one for each. Not sure if pcap library returns same > handler or two different handlers for this case since iface name is same. > For "iface" argument pcap_open_live() called once, so we have single handler > for both Rx & Tx. This may be difference. > >> And still even when using "iface" I also see packets that are >> transmitted out of eth1 (e.g. tcpreplay -i eth1 packets.pcap) and not >> only packets that are received (e.g. ping from far end to eth0 ip) > > This is interesting, I have tried with external packet generator, "iface" was > working as expecte
[dpdk-dev] [PATCH 2/2] net/pcap: duplicate code consolidation
Signed-off-by: ido goshen --- drivers/net/pcap/rte_eth_pcap.c | 58 - 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/drivers/net/pcap/rte_eth_pcap.c b/drivers/net/pcap/rte_eth_pcap.c index 444abbb..e63998b 100644 --- a/drivers/net/pcap/rte_eth_pcap.c +++ b/drivers/net/pcap/rte_eth_pcap.c @@ -658,6 +658,20 @@ struct pmd_devargs { .stats_reset = eth_stats_reset, }; +static int +add_queue(struct pmd_devargs *pmd, const char *name, const char *type, + pcap_t *pcap, pcap_dumper_t *dumper) +{ + if (pcap) + pmd->queue[pmd->num_of_queue].pcap = pcap; + if (dumper) + pmd->queue[pmd->num_of_queue].dumper = dumper; + pmd->queue[pmd->num_of_queue].name = name; + pmd->queue[pmd->num_of_queue].type = type; + pmd->num_of_queue++; + return 0; +} + /* * Function handler that opens the pcap file for reading a stores a * reference of it for use it later on. @@ -672,10 +686,7 @@ struct pmd_devargs { if (open_single_rx_pcap(pcap_filename, &pcap) < 0) return -1; - rx->queue[rx->num_of_queue].pcap = pcap; - rx->queue[rx->num_of_queue].name = pcap_filename; - rx->queue[rx->num_of_queue].type = key; - rx->num_of_queue++; + add_queue(rx, pcap_filename, key, pcap, NULL); return 0; } @@ -694,10 +705,7 @@ struct pmd_devargs { if (open_single_tx_pcap(pcap_filename, &dumper) < 0) return -1; - dumpers->queue[dumpers->num_of_queue].dumper = dumper; - dumpers->queue[dumpers->num_of_queue].name = pcap_filename; - dumpers->queue[dumpers->num_of_queue].type = key; - dumpers->num_of_queue++; + add_queue(dumpers, pcap_filename, key, NULL, dumper); return 0; } @@ -722,44 +730,36 @@ struct pmd_devargs { return 0; } -/* - * Opens a NIC for reading packets from it - */ static inline int -open_rx_iface(const char *key, const char *value, void *extra_args) +open_iface(const char *key, const char *value, void *extra_args) { const char *iface = value; - struct pmd_devargs *rx = extra_args; + struct pmd_devargs *pmd = extra_args; pcap_t *pcap = NULL; if (open_single_iface(iface, &pcap) < 0) return -1; - rx->queue[rx->num_of_queue].pcap = pcap; - rx->queue[rx->num_of_queue].name = iface; - rx->queue[rx->num_of_queue].type = key; - rx->num_of_queue++; + add_queue(pmd, iface, key, pcap, NULL); return 0; } /* + * Opens a NIC for reading packets from it + */ +static inline int +open_rx_iface(const char *key, const char *value, void *extra_args) +{ + return open_iface(key, value, extra_args); +} + +/* * Opens a NIC for writing packets to it */ static int open_tx_iface(const char *key, const char *value, void *extra_args) { - const char *iface = value; - struct pmd_devargs *tx = extra_args; - pcap_t *pcap; - - if (open_single_iface(iface, &pcap) < 0) - return -1; - tx->queue[tx->num_of_queue].pcap = pcap; - tx->queue[tx->num_of_queue].name = iface; - tx->queue[tx->num_of_queue].type = key; - tx->num_of_queue++; - - return 0; + return open_iface(key, value, extra_args); } static struct rte_vdev_driver pmd_pcap_drv; -- 1.9.1
[dpdk-dev] [PATCH 1/2] net/pcap: multiple queues fix
Change open_rx/tx_pcap/iface functions to open only a single pcap/dumper and not loop num_of_queue times The num_of_queue loop is already acheived by the caller rte_kvargs_process Fixes: 1. Opens N requested pcaps/dumpers instead of N^2 2. Leak of pcap/dumper's which are being overwritten by the sequential calls to open_rx/tx_pcap/iface functions 3. Use the filename/iface args per queue and not just the last one that overwrites the previous names Signed-off-by: ido goshen --- drivers/net/pcap/rte_eth_pcap.c | 85 + 1 file changed, 35 insertions(+), 50 deletions(-) diff --git a/drivers/net/pcap/rte_eth_pcap.c b/drivers/net/pcap/rte_eth_pcap.c index 6bd4a7d..444abbb 100644 --- a/drivers/net/pcap/rte_eth_pcap.c +++ b/drivers/net/pcap/rte_eth_pcap.c @@ -665,19 +665,17 @@ struct pmd_devargs { static int open_rx_pcap(const char *key, const char *value, void *extra_args) { - unsigned int i; const char *pcap_filename = value; struct pmd_devargs *rx = extra_args; pcap_t *pcap = NULL; - for (i = 0; i < rx->num_of_queue; i++) { - if (open_single_rx_pcap(pcap_filename, &pcap) < 0) - return -1; + if (open_single_rx_pcap(pcap_filename, &pcap) < 0) + return -1; - rx->queue[i].pcap = pcap; - rx->queue[i].name = pcap_filename; - rx->queue[i].type = key; - } + rx->queue[rx->num_of_queue].pcap = pcap; + rx->queue[rx->num_of_queue].name = pcap_filename; + rx->queue[rx->num_of_queue].type = key; + rx->num_of_queue++; return 0; } @@ -689,19 +687,17 @@ struct pmd_devargs { static int open_tx_pcap(const char *key, const char *value, void *extra_args) { - unsigned int i; const char *pcap_filename = value; struct pmd_devargs *dumpers = extra_args; pcap_dumper_t *dumper; - for (i = 0; i < dumpers->num_of_queue; i++) { - if (open_single_tx_pcap(pcap_filename, &dumper) < 0) - return -1; + if (open_single_tx_pcap(pcap_filename, &dumper) < 0) + return -1; - dumpers->queue[i].dumper = dumper; - dumpers->queue[i].name = pcap_filename; - dumpers->queue[i].type = key; - } + dumpers->queue[dumpers->num_of_queue].dumper = dumper; + dumpers->queue[dumpers->num_of_queue].name = pcap_filename; + dumpers->queue[dumpers->num_of_queue].type = key; + dumpers->num_of_queue++; return 0; } @@ -732,18 +728,16 @@ struct pmd_devargs { static inline int open_rx_iface(const char *key, const char *value, void *extra_args) { - unsigned int i; const char *iface = value; struct pmd_devargs *rx = extra_args; pcap_t *pcap = NULL; - for (i = 0; i < rx->num_of_queue; i++) { - if (open_single_iface(iface, &pcap) < 0) - return -1; - rx->queue[i].pcap = pcap; - rx->queue[i].name = iface; - rx->queue[i].type = key; - } + if (open_single_iface(iface, &pcap) < 0) + return -1; + rx->queue[rx->num_of_queue].pcap = pcap; + rx->queue[rx->num_of_queue].name = iface; + rx->queue[rx->num_of_queue].type = key; + rx->num_of_queue++; return 0; } @@ -754,18 +748,16 @@ struct pmd_devargs { static int open_tx_iface(const char *key, const char *value, void *extra_args) { - unsigned int i; const char *iface = value; struct pmd_devargs *tx = extra_args; pcap_t *pcap; - for (i = 0; i < tx->num_of_queue; i++) { - if (open_single_iface(iface, &pcap) < 0) - return -1; - tx->queue[i].pcap = pcap; - tx->queue[i].name = iface; - tx->queue[i].type = key; - } + if (open_single_iface(iface, &pcap) < 0) + return -1; + tx->queue[tx->num_of_queue].pcap = pcap; + tx->queue[tx->num_of_queue].name = iface; + tx->queue[tx->num_of_queue].type = key; + tx->num_of_queue++; return 0; } @@ -958,15 +950,8 @@ struct pmd_devargs { * We check whether we want to open a RX stream from a real NIC or a * pcap file */ - pcaps.num_of_queue = rte_kvargs_count(kvlist, ETH_PCAP_RX_PCAP_ARG); - if (pcaps.num_of_queue) - is_rx_pcap = 1; - else - pcaps.num_of_queue = rte_kvargs_count(kvlist, - ETH_PCAP_RX_IFACE_ARG); - - if (pcaps.num_of_queue > RTE_PMD_PCAP_MAX_QUEUES) - pcaps.num_of_queue = RTE_PMD_PCAP_MAX_QUEUES; + is_rx_pcap = rte_kvargs_count(kvlist, ETH_PCAP_RX_PCAP_ARG) ? 1 : 0; + pcaps.num_of_queue = 0; if (is_rx_pcap) ret = rte_kvargs_process(kvlist, ETH_PCAP_RX_PCAP_ARG, @@ -975,6 +960,10 @@ str
[dpdk-dev] [PATCH v2 0/3] Support UDP/IPv4 GSO
With the support of UDP Fragmentation Offload (UFO) and TCP Segmentation Offload (TSO) in virtio, VMs can exchange large UDP and TCP packets exceeding MTU between each other, which can greatly reduce per-packet processing overheads. When the destination of the large TCP and UDP packets is crossing machines, the host application needs to call two different libraries, GSO and IP fragmentation, to split the large packets respectively. However,the GSO and IP fragmentation library have quite different APIs, which greatly complicates the host application implementation. To simplify application development, we propose to support UDP/IPv4 fragmentation in the GSO library. With supporting UDP GSO, host applicationss can use the unified APIs to split large UDP and TCP packets. This patchset is to support UDP/IPv4 GSO. The first patch is to provide UDP GSO function, the second patch is to enable UDP/IPv4 GSO in the testpmd checksum forwarding engine, and the last patch is to update the programmer guide and testpmd user guide. Change log == V2: - fix fragment offset calculation bug. - add UDP GSO description in testpmd user guide. - shorten the second patch name. Jiayu Hu (3): gso: support UDP/IPv4 fragmentation app/testpmd: enable UDP GSO in csum engine gso: update documents for UDP/IPv4 GSO app/test-pmd/cmdline.c | 5 +- app/test-pmd/csumonly.c| 2 + app/test-pmd/testpmd.c | 2 +- .../generic_segmentation_offload_lib.rst | 6 ++ doc/guides/testpmd_app_ug/testpmd_funcs.rst| 6 ++ lib/librte_gso/Makefile| 1 + lib/librte_gso/gso_common.h| 3 + lib/librte_gso/gso_udp4.c | 81 ++ lib/librte_gso/gso_udp4.h | 42 +++ lib/librte_gso/rte_gso.c | 24 +-- lib/librte_gso/rte_gso.h | 6 +- 11 files changed, 169 insertions(+), 9 deletions(-) create mode 100644 lib/librte_gso/gso_udp4.c create mode 100644 lib/librte_gso/gso_udp4.h -- 2.7.4
[dpdk-dev] [PATCH v2 1/3] gso: support UDP/IPv4 fragmentation
This patch adds GSO support for UDP/IPv4 packets. Supported packets may include a single VLAN tag. UDP/IPv4 GSO doesn't check if input packets have correct checksums, and doesn't update checksums for output packets (the responsibility for this lies with the application). Additionally, UDP/IPv4 GSO doesn't process IP fragmented packets. UDP/IPv4 GSO uses two chained MBUFs, one direct MBUF and one indrect MBUF, to organize an output packet. The direct MBUF stores the packet header, while the indirect mbuf simply points to a location within the original packet's payload. Consequently, use of UDP GSO requires multi-segment MBUF support in the TX functions of the NIC driver. If a packet is GSO'd, UDP/IPv4 GSO reduces its MBUF refcnt by 1. As a result, when all of its GSOed segments are freed, the packet is freed automatically. Signed-off-by: Jiayu Hu --- lib/librte_gso/Makefile | 1 + lib/librte_gso/gso_common.h | 3 ++ lib/librte_gso/gso_udp4.c | 81 + lib/librte_gso/gso_udp4.h | 42 +++ lib/librte_gso/rte_gso.c| 24 +++--- lib/librte_gso/rte_gso.h| 6 +++- 6 files changed, 151 insertions(+), 6 deletions(-) create mode 100644 lib/librte_gso/gso_udp4.c create mode 100644 lib/librte_gso/gso_udp4.h diff --git a/lib/librte_gso/Makefile b/lib/librte_gso/Makefile index 3648ec0..1fac53a 100644 --- a/lib/librte_gso/Makefile +++ b/lib/librte_gso/Makefile @@ -19,6 +19,7 @@ SRCS-$(CONFIG_RTE_LIBRTE_GSO) += rte_gso.c SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_common.c SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_tcp4.c SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_tunnel_tcp4.c +SRCS-$(CONFIG_RTE_LIBRTE_GSO) += gso_udp4.c # install this header file SYMLINK-$(CONFIG_RTE_LIBRTE_GSO)-include += rte_gso.h diff --git a/lib/librte_gso/gso_common.h b/lib/librte_gso/gso_common.h index 5ca5974..6cd764f 100644 --- a/lib/librte_gso/gso_common.h +++ b/lib/librte_gso/gso_common.h @@ -31,6 +31,9 @@ (PKT_TX_TCP_SEG | PKT_TX_IPV4 | PKT_TX_OUTER_IPV4 | \ PKT_TX_TUNNEL_GRE)) +#define IS_IPV4_UDP(flag) (((flag) & (PKT_TX_UDP_SEG | PKT_TX_IPV4)) == \ + (PKT_TX_UDP_SEG | PKT_TX_IPV4)) + /** * Internal function which updates the UDP header of a packet, following * segmentation. This is required to update the header's datagram length field. diff --git a/lib/librte_gso/gso_udp4.c b/lib/librte_gso/gso_udp4.c new file mode 100644 index 000..a3db329 --- /dev/null +++ b/lib/librte_gso/gso_udp4.c @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include "gso_common.h" +#include "gso_udp4.h" + +#define IPV4_HDR_MF_BIT (1U << 13) + +static inline void +update_ipv4_udp_headers(struct rte_mbuf *pkt, struct rte_mbuf **segs, + uint16_t nb_segs) +{ + struct ipv4_hdr *ipv4_hdr; + uint16_t frag_offset = 0, is_mf; + uint16_t l2_hdrlen = pkt->l2_len, l3_hdrlen = pkt->l3_len; + uint16_t tail_idx = nb_segs - 1, length, i; + + /* +* Update IP header fields for output segments. Specifically, +* keep the same IP id, update fragment offset and total +* length. +*/ + for (i = 0; i < nb_segs; i++) { + ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(segs[i], + char *) + l2_hdrlen); + length = segs[i]->pkt_len - l2_hdrlen; + ipv4_hdr->total_length = rte_cpu_to_be_16(length); + + is_mf = i < tail_idx ? IPV4_HDR_MF_BIT : 0; + ipv4_hdr->fragment_offset = + rte_cpu_to_be_16(frag_offset | is_mf); + frag_offset += ((length - l3_hdrlen) >> 3); + } +} + +int +gso_udp4_segment(struct rte_mbuf *pkt, + uint16_t gso_size, + struct rte_mempool *direct_pool, + struct rte_mempool *indirect_pool, + struct rte_mbuf **pkts_out, + uint16_t nb_pkts_out) +{ + struct ipv4_hdr *ipv4_hdr; + uint16_t pyld_unit_size, hdr_offset; + uint16_t frag_off; + int ret; + + /* Don't process the fragmented packet */ + ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) + + pkt->l2_len); + frag_off = rte_be_to_cpu_16(ipv4_hdr->fragment_offset); + if (unlikely(IS_FRAGMENTED(frag_off))) { + pkts_out[0] = pkt; + return 1; + } + + /* +* UDP fragmentation is the same as IP fragmentation. +* Except the first one, other output packets just have l2 +* and l3 headers. +*/ + hdr_offset = pkt->l2_len + pkt->l3_len; + + /* Don't process the packet without data. */ + if (unlikely(hdr_offset + pkt->l4_len >= pkt->pkt_len)) { + pkts_out[0] = pkt; + return 1; + } + + pyld_unit_size = gso_size - hdr_offset; + + /* Segment
[dpdk-dev] [PATCH v2 3/3] gso: update documents for UDP/IPv4 GSO
This patch updates the programmer guide and testpmd user guide for UDP/IPv4 GSO. Signed-off-by: Jiayu Hu --- doc/guides/prog_guide/generic_segmentation_offload_lib.rst | 6 ++ doc/guides/testpmd_app_ug/testpmd_funcs.rst| 6 ++ 2 files changed, 12 insertions(+) diff --git a/doc/guides/prog_guide/generic_segmentation_offload_lib.rst b/doc/guides/prog_guide/generic_segmentation_offload_lib.rst index 9959f0d..cf700c5 100644 --- a/doc/guides/prog_guide/generic_segmentation_offload_lib.rst +++ b/doc/guides/prog_guide/generic_segmentation_offload_lib.rst @@ -43,6 +43,7 @@ Limitations #. Currently, the GSO library supports the following IPv4 packet types: - TCP + - UDP - VxLAN - GRE @@ -146,6 +147,11 @@ TCP/IPv4 GSO TCP/IPv4 GSO supports segmentation of suitably large TCP/IPv4 packets, which may also contain an optional VLAN tag. +UDP/IPv4 GSO + +UDP/IPv4 GSO supports segmentation of suitably large UDP/IPv4 packets, which +may also contain an optional VLAN tag. + VxLAN GSO ~ VxLAN packets GSO supports segmentation of suitably large VxLAN packets, diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst index 0d6fd50..1a9dcd8 100644 --- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst +++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst @@ -1059,6 +1059,12 @@ By default, GSO is disabled for all ports. testpmd> csum set tcp hw + UDP GSO is the same as IP fragmentation, which treats the UDP header + as the payload and does not modify it during segmentation. That is, + after UDP GSO, only the first output fragment has the original UDP + header. Therefore, users just need to enable HW IPv4 checksum + calculation for GSO-enabled ports, when input packets are UDP/IPv4. + set gso segsz ~ -- 2.7.4
[dpdk-dev] [PATCH v2 2/3] app/testpmd: enable UDP GSO in csum engine
This patch enables GSO for UDP/IPv4 packets. Oversized UDP/IPv4 packets transmitted over a GSO-enabled port will undergo segmentation. Signed-off-by: Jiayu Hu --- app/test-pmd/cmdline.c | 5 +++-- app/test-pmd/csumonly.c | 2 ++ app/test-pmd/testpmd.c | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c index 27e2aa8..4239e91 100644 --- a/app/test-pmd/cmdline.c +++ b/app/test-pmd/cmdline.c @@ -4792,8 +4792,9 @@ cmd_gso_show_parsed(void *parsed_result, if (gso_ports[res->cmd_pid].enable) { printf("Max GSO'd packet size: %uB\n" "Supported GSO types: TCP/IPv4, " - "VxLAN with inner TCP/IPv4 packet, " - "GRE with inner TCP/IPv4 packet\n", + "UDP/IPv4, VxLAN with inner " + "TCP/IPv4 packet, GRE with inner " + "TCP/IPv4 packet\n", gso_max_segment_size); } else printf("GSO is not enabled on Port %u\n", res->cmd_pid); diff --git a/app/test-pmd/csumonly.c b/app/test-pmd/csumonly.c index 0bb88cf..4948292 100644 --- a/app/test-pmd/csumonly.c +++ b/app/test-pmd/csumonly.c @@ -411,6 +411,8 @@ process_inner_cksums(void *l3_hdr, const struct testpmd_offload_info *info, info->ethertype); } } + if (info->gso_enable) + ol_flags |= PKT_TX_UDP_SEG; } else if (info->l4_proto == IPPROTO_TCP) { tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + info->l3_len); tcp_hdr->cksum = 0; diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c index 35cf266..b5766ff 100644 --- a/app/test-pmd/testpmd.c +++ b/app/test-pmd/testpmd.c @@ -777,7 +777,7 @@ init_config(void) init_port_config(); gso_types = DEV_TX_OFFLOAD_TCP_TSO | DEV_TX_OFFLOAD_VXLAN_TNL_TSO | - DEV_TX_OFFLOAD_GRE_TNL_TSO; + DEV_TX_OFFLOAD_GRE_TNL_TSO | DEV_TX_OFFLOAD_UDP_TSO; /* * Records which Mbuf pool to use by each logical core, if needed. */ -- 2.7.4