[dpdk-dev] [PATCH v7 0/5] ethdev: add speed capabilities and refactor link API
The current rte_eth_dev_info abstraction does not provide any mechanism to get the supported speed(s) of an ethdev. For some drivers (e.g. ixgbe), an educated guess can be made based on the driver's name (driver_name in rte_eth_dev_info), see: http://dpdk.org/ml/archives/dev/2013-August/000412.html

However, i) doing string comparisons is annoying, and can silently break existing applications if PMDs change their names; ii) it does not expose all the supported capabilities of the ethdev; and iii) for some drivers the application cannot correctly determine the (max) speed (e.g. for i40e, distinguishing between XL710 and X710). In addition, the link APIs do not allow defining a set of advertised link speeds for auto-negotiation.

This series of patches adds the following capabilities:

* speed_capa bitmap in rte_eth_dev_info, which is filled by the PMDs according to the physical device capabilities.
* refactored link API in ethdev to allow defining the advertised link speeds, forcing a fixed speed (no auto-negotiation), or advertising all supported speeds (the default).

WARNING: this patch series, specifically 3/5, is NOT tested for most of the PMDs, due to the lack of hardware. Only generic EM is tested (VM). Reviewing and testing is required by the PMD maintainers.

v2: rebase; converted speed_capa into a 32-bit bitmap; fixed alignment (checkpatch).
v3: rebase to v2.1. Unified ETH_LINK_SPEED and ETH_SPEED_CAP into ETH_SPEED. Converted the speed field in struct rte_eth_conf to speeds, to allow a bitmap for defining the announced speeds, as suggested by M. Brorup. Fixed spelling issues.
v4: fixed errata in the documentation of the speeds field of rte_eth_conf, and the commit 1/2 message. Rebased to v2.1.0 (v3 was incorrectly based on ~2.1.0-rc1).
v5: reverted to the v2 speed capabilities patch. Fixed MLX4 speed capabilities (thanks N. Laranjeiro). Refactored the link speed API to allow setting advertised speeds (3/4). Added a NO_AUTONEG option to explicitly disable auto-negotiation. Updated the 2.2 release notes (4/4). Rebased to current HEAD.
v6: moved link_duplex to be part of the bitfield. Fixed the i40e autoneg flag link update code. Added rte_eth_speed_to_bm_flag() to the .map file. Fixed other spelling issues. Rebased to current HEAD.
v7: rebased to current HEAD. Moved documentation to v2.3. Still needs testing from PMD maintainers.

Marc Sune (5):
  ethdev: Added ETH_SPEED_CAP bitmap for ports
  ethdev: Fill speed capability bitmaps in the PMDs
  ethdev: redesign link speed config API
  doc: update with link changes
  ethdev: add rte_eth_speed_to_bm_flag() to ver. map
 app/test-pmd/cmdline.c | 124 +++--
 app/test/virtual_pmd.c | 4 +-
 doc/guides/rel_notes/release_2_2.rst | 23 ++
 drivers/net/af_packet/rte_eth_af_packet.c | 5 +-
 drivers/net/bonding/rte_eth_bond_8023ad.c | 14 ++--
 drivers/net/cxgbe/base/t4_hw.c | 8 +-
 drivers/net/e1000/base/e1000_80003es2lan.c | 6 +-
 drivers/net/e1000/base/e1000_82541.c | 8 +-
 drivers/net/e1000/base/e1000_82543.c | 4 +-
 drivers/net/e1000/base/e1000_82575.c | 11 +--
 drivers/net/e1000/base/e1000_api.c | 2 +-
 drivers/net/e1000/base/e1000_api.h | 2 +-
 drivers/net/e1000/base/e1000_defines.h | 4 +-
 drivers/net/e1000/base/e1000_hw.h | 2 +-
 drivers/net/e1000/base/e1000_ich8lan.c | 4 +-
 drivers/net/e1000/base/e1000_mac.c | 9 ++-
 drivers/net/e1000/base/e1000_mac.h | 6 +-
 drivers/net/e1000/base/e1000_vf.c | 4 +-
 drivers/net/e1000/base/e1000_vf.h | 2 +-
 drivers/net/e1000/em_ethdev.c | 109 -
 drivers/net/e1000/igb_ethdev.c | 104 +---
 drivers/net/fm10k/fm10k_ethdev.c | 5 +-
 drivers/net/i40e/i40e_ethdev.c | 78 ++
 drivers/net/i40e/i40e_ethdev_vf.c | 11 +--
 drivers/net/ixgbe/ixgbe_ethdev.c | 74 -
 drivers/net/mlx4/mlx4.c | 6 ++
 drivers/net/mpipe/mpipe_tilegx.c | 6 +-
 drivers/net/null/rte_eth_null.c | 5 +-
 drivers/net/pcap/rte_eth_pcap.c | 9 ++-
 drivers/net/ring/rte_eth_ring.c | 5 +-
 drivers/net/virtio/virtio_ethdev.c | 2 +-
 drivers/net/virtio/virtio_ethdev.h | 2 -
 drivers/net/vmxnet3/vmxnet3_ethdev.c | 5 +-
 drivers/net/xenvirt/rte_eth_xenvirt.c | 5 +-
 examples/ip_pipeline/config_parse.c | 3 +-
 lib/librte_ether/rte_ethdev.c | 49
 lib/librte_ether/rte_ethdev.h | 97 +-
 lib/librte_ether/rte_ether_version.map | 6 ++
 38 files changed, 501 insertions(+), 322 deletions(-)
-- 2.1.4
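For illustration, here is a minimal sketch (not part of the series) of how an application could consume the new capability bitmap instead of matching driver_name strings; it relies only on the speed_capa field and the ETH_SPEED_CAP_* flags introduced by patch 1/5:

/* Sketch only: query the per-port speed capability bitmap added by this
 * series and test a few of the ETH_SPEED_CAP_* flags. */
#include <stdio.h>
#include <rte_ethdev.h>

static void
print_port_speed_capa(uint8_t port_id)
{
	struct rte_eth_dev_info dev_info;

	rte_eth_dev_info_get(port_id, &dev_info);

	if (dev_info.speed_capa & ETH_SPEED_CAP_40G)
		printf("port %u: 40G capable\n", (unsigned)port_id);
	if (dev_info.speed_capa & ETH_SPEED_CAP_10G)
		printf("port %u: 10G capable\n", (unsigned)port_id);
	if (dev_info.speed_capa & ETH_SPEED_CAP_1G)
		printf("port %u: 1G capable\n", (unsigned)port_id);
}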
[dpdk-dev] [PATCH v7 1/5] ethdev: Added ETH_SPEED_CAP bitmap for ports
Added constants and bitmap to struct rte_eth_dev_info to be used by PMDs. Signed-off-by: Marc Sune --- lib/librte_ether/rte_ethdev.h | 24 1 file changed, 24 insertions(+) diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h index 8710dd7..dbc1599 100644 --- a/lib/librte_ether/rte_ethdev.h +++ b/lib/librte_ether/rte_ethdev.h @@ -824,6 +824,29 @@ struct rte_eth_conf { #define DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM 0x0080 /**< Used for tunneling packet. */ #define DEV_TX_OFFLOAD_QINQ_INSERT 0x0100 +/** + * Device supported speeds + */ +#define ETH_SPEED_CAP_NOT_PHY (0) /*< No phy media > */ +#define ETH_SPEED_CAP_10M_HD (1 << 0) /*< 10 Mbps half-duplex> */ +#define ETH_SPEED_CAP_10M_FD (1 << 1) /*< 10 Mbps full-duplex> */ +#define ETH_SPEED_CAP_100M_HD (1 << 2) /*< 100 Mbps half-duplex> */ +#define ETH_SPEED_CAP_100M_FD (1 << 3) /*< 100 Mbps full-duplex> */ +#define ETH_SPEED_CAP_1G (1 << 4) /*< 1 Gbps > */ +#define ETH_SPEED_CAP_2_5G (1 << 5) /*< 2.5 Gbps > */ +#define ETH_SPEED_CAP_5G (1 << 6) /*< 5 Gbps > */ +#define ETH_SPEED_CAP_10G (1 << 7) /*< 10 Mbps > */ +#define ETH_SPEED_CAP_20G (1 << 8) /*< 20 Gbps > */ +#define ETH_SPEED_CAP_25G (1 << 9) /*< 25 Gbps > */ +#define ETH_SPEED_CAP_40G (1 << 10) /*< 40 Gbps > */ +#define ETH_SPEED_CAP_50G (1 << 11) /*< 50 Gbps > */ +#define ETH_SPEED_CAP_56G (1 << 12) /*< 56 Gbps > */ +#define ETH_SPEED_CAP_100G (1 << 13) /*< 100 Gbps > */ + + +/** + * Ethernet device information + */ struct rte_eth_dev_info { struct rte_pci_device *pci_dev; /**< Device PCI information. */ const char *driver_name; /**< Device Driver name. */ @@ -852,6 +875,7 @@ struct rte_eth_dev_info { uint16_t vmdq_pool_base; /**< First ID of VMDQ pools. */ struct rte_eth_desc_lim rx_desc_lim; /**< RX descriptors limits */ struct rte_eth_desc_lim tx_desc_lim; /**< TX descriptors limits */ + uint32_t speed_capa; /**< Supported speeds bitmap (ETH_SPEED_CAP_). */ }; /** -- 2.1.4
[dpdk-dev] [PATCH v7 2/5] ethdev: Fill speed capability bitmaps in the PMDs
Added speed capabilities to all pmds supporting physical NICs: * e1000 * ixgbe * i40 * mlx4 * fm10k Signed-off-by: Marc Sune --- drivers/net/e1000/em_ethdev.c| 6 ++ drivers/net/e1000/igb_ethdev.c | 6 ++ drivers/net/fm10k/fm10k_ethdev.c | 4 drivers/net/i40e/i40e_ethdev.c | 9 + drivers/net/ixgbe/ixgbe_ethdev.c | 10 ++ drivers/net/mlx4/mlx4.c | 4 6 files changed, 39 insertions(+) diff --git a/drivers/net/e1000/em_ethdev.c b/drivers/net/e1000/em_ethdev.c index 66e8993..fd48bbf 100644 --- a/drivers/net/e1000/em_ethdev.c +++ b/drivers/net/e1000/em_ethdev.c @@ -1023,6 +1023,12 @@ eth_em_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info) .nb_min = E1000_MIN_RING_DESC, .nb_align = EM_TXD_ALIGN, }; + + dev_info->speed_capa = ETH_SPEED_CAP_10M_HD | + ETH_SPEED_CAP_10M_FD | + ETH_SPEED_CAP_100M_HD | + ETH_SPEED_CAP_100M_FD | + ETH_SPEED_CAP_1G; } /* return 0 means link status changed, -1 means not changed */ diff --git a/drivers/net/e1000/igb_ethdev.c b/drivers/net/e1000/igb_ethdev.c index d1bbcda..9c8dffa 100644 --- a/drivers/net/e1000/igb_ethdev.c +++ b/drivers/net/e1000/igb_ethdev.c @@ -1908,6 +1908,12 @@ eth_igb_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info) dev_info->rx_desc_lim = rx_desc_lim; dev_info->tx_desc_lim = tx_desc_lim; + + dev_info->speed_capa = ETH_SPEED_CAP_10M_HD | + ETH_SPEED_CAP_10M_FD | + ETH_SPEED_CAP_100M_HD | + ETH_SPEED_CAP_100M_FD | + ETH_SPEED_CAP_1G; } static void diff --git a/drivers/net/fm10k/fm10k_ethdev.c b/drivers/net/fm10k/fm10k_ethdev.c index e4aed94..aeb2962 100644 --- a/drivers/net/fm10k/fm10k_ethdev.c +++ b/drivers/net/fm10k/fm10k_ethdev.c @@ -1333,6 +1333,10 @@ fm10k_dev_infos_get(struct rte_eth_dev *dev, .nb_min = FM10K_MIN_TX_DESC, .nb_align = FM10K_MULT_TX_DESC, }; + + dev_info->speed_capa = ETH_SPEED_CAP_1G | ETH_SPEED_CAP_2_5G | + ETH_SPEED_CAP_10G | ETH_SPEED_CAP_25G | + ETH_SPEED_CAP_40G | ETH_SPEED_CAP_100G; } static int diff --git a/drivers/net/i40e/i40e_ethdev.c b/drivers/net/i40e/i40e_ethdev.c index bf6220d..d242973 100644 --- a/drivers/net/i40e/i40e_ethdev.c +++ b/drivers/net/i40e/i40e_ethdev.c @@ -2233,6 +2233,7 @@ static void i40e_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info) { struct i40e_pf *pf = I40E_DEV_PRIVATE_TO_PF(dev->data->dev_private); + struct i40e_hw *hw = I40E_DEV_PRIVATE_TO_HW(dev->data->dev_private); struct i40e_vsi *vsi = pf->main_vsi; dev_info->max_rx_queues = vsi->nb_qps; @@ -2304,6 +2305,14 @@ i40e_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info) dev_info->max_rx_queues += dev_info->vmdq_queue_num; dev_info->max_tx_queues += dev_info->vmdq_queue_num; } + + if (i40e_is_40G_device(hw->device_id)) + /* For XL710 */ + dev_info->speed_capa = ETH_SPEED_CAP_1G | ETH_SPEED_CAP_10G; + else + /* For X710 */ + dev_info->speed_capa = ETH_SPEED_CAP_10G | ETH_SPEED_CAP_40G; + } static int diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c index 4c4c6df..4e3ab3d 100644 --- a/drivers/net/ixgbe/ixgbe_ethdev.c +++ b/drivers/net/ixgbe/ixgbe_ethdev.c @@ -2827,6 +2827,16 @@ ixgbe_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info) dev_info->hash_key_size = IXGBE_HKEY_MAX_INDEX * sizeof(uint32_t); dev_info->reta_size = ixgbe_reta_size_get(hw->mac.type); dev_info->flow_type_rss_offloads = IXGBE_RSS_OFFLOAD_ALL; + + dev_info->speed_capa = ETH_SPEED_CAP_1G | ETH_SPEED_CAP_10G; + + if (hw->mac.type == ixgbe_mac_X540 || + hw->mac.type == ixgbe_mac_X540_vf || + hw->mac.type == ixgbe_mac_X550 || + hw->mac.type == 
ixgbe_mac_X550_vf) + + dev_info->speed_capa |= ETH_SPEED_CAP_100M_FD /*| + ETH_SPEED_CAP_100M_HD*/; } static void diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c index 207bfe2..1f3ed59 100644 --- a/drivers/net/mlx4/mlx4.c +++ b/drivers/net/mlx4/mlx4.c @@ -4265,6 +4265,10 @@ mlx4_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) 0); if (priv_get_ifname(priv, &ifname) == 0) info->if_index = if_nametoindex(ifname); + + info->speed_capa = ETH_SPEED_CAP_10G |ETH_SPEED_CAP_40G | + ETH_SPE
[dpdk-dev] [PATCH v7 3/5] ethdev: redesign link speed config API
This patch redesigns the API to set the link speed/s configure for an ethernet port. Specifically: - it allows to define a set of advertised speeds for auto-negociation. - it allows to disable link auto-negociation (single fixed speed). - default: auto-negociate all supported speeds. Other changes: * Added utility MACROs ETH_SPEED_NUM_XXX with the numeric values of all supported link speeds, in Mbps. * Converted link_speed to uint32_t to accomodate 100G speeds (bug). * Added autoneg flag in struct rte_eth_link to indicate if link speed was a result of auto-negociation or was fixed by configuration. * Added utility function to convert numeric speeds to bitmap fields. Signed-off-by: Marc Sune --- app/test-pmd/cmdline.c | 124 +++-- app/test/virtual_pmd.c | 4 +- drivers/net/af_packet/rte_eth_af_packet.c | 5 +- drivers/net/bonding/rte_eth_bond_8023ad.c | 14 ++-- drivers/net/cxgbe/base/t4_hw.c | 8 +- drivers/net/e1000/base/e1000_80003es2lan.c | 6 +- drivers/net/e1000/base/e1000_82541.c | 8 +- drivers/net/e1000/base/e1000_82543.c | 4 +- drivers/net/e1000/base/e1000_82575.c | 11 +-- drivers/net/e1000/base/e1000_api.c | 2 +- drivers/net/e1000/base/e1000_api.h | 2 +- drivers/net/e1000/base/e1000_defines.h | 4 +- drivers/net/e1000/base/e1000_hw.h | 2 +- drivers/net/e1000/base/e1000_ich8lan.c | 7 +- drivers/net/e1000/base/e1000_mac.c | 9 ++- drivers/net/e1000/base/e1000_mac.h | 6 +- drivers/net/e1000/base/e1000_vf.c | 4 +- drivers/net/e1000/base/e1000_vf.h | 2 +- drivers/net/e1000/em_ethdev.c | 113 +- drivers/net/e1000/igb_ethdev.c | 108 + drivers/net/fm10k/fm10k_ethdev.c | 8 +- drivers/net/i40e/i40e_ethdev.c | 73 - drivers/net/i40e/i40e_ethdev_vf.c | 11 +-- drivers/net/ixgbe/ixgbe_ethdev.c | 78 -- drivers/net/mlx4/mlx4.c| 2 + drivers/net/mpipe/mpipe_tilegx.c | 6 +- drivers/net/null/rte_eth_null.c| 5 +- drivers/net/pcap/rte_eth_pcap.c| 9 ++- drivers/net/ring/rte_eth_ring.c| 5 +- drivers/net/virtio/virtio_ethdev.c | 2 +- drivers/net/virtio/virtio_ethdev.h | 2 - drivers/net/vmxnet3/vmxnet3_ethdev.c | 5 +- drivers/net/xenvirt/rte_eth_xenvirt.c | 5 +- examples/ip_pipeline/config_parse.c| 3 +- lib/librte_ether/rte_ethdev.c | 49 lib/librte_ether/rte_ethdev.h | 113 -- 36 files changed, 454 insertions(+), 365 deletions(-) diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c index 6d28c1b..00571bc 100644 --- a/app/test-pmd/cmdline.c +++ b/app/test-pmd/cmdline.c @@ -956,14 +956,65 @@ struct cmd_config_speed_all { cmdline_fixed_string_t value2; }; +static int +parse_and_check_speed_duplex(char *value1, char *value2, uint32_t *link_speed) +{ + + int duplex; + + if (!strcmp(value2, "half")) { + duplex = 0; + } else if (!strcmp(value2, "full")) { + duplex = 1; + } else if (!strcmp(value2, "auto")) { + duplex = 1; + } else { + printf("Unknown parameter\n"); + return -1; + } + + if (!strcmp(value1, "10")) { + *link_speed = (duplex) ? ETH_LINK_SPEED_10M : + ETH_LINK_SPEED_10M_HD; + } else if (!strcmp(value1, "100")) { + *link_speed = (duplex) ? 
ETH_LINK_SPEED_100M : + ETH_LINK_SPEED_100M_HD; + } else if (!strcmp(value1, "1000")) { + if (!duplex) + goto invalid_speed_param; + *link_speed = ETH_LINK_SPEED_1G; + } else if (!strcmp(value1, "1")) { + if (!duplex) + goto invalid_speed_param; + *link_speed = ETH_LINK_SPEED_10G; + } else if (!strcmp(value1, "4")) { + if (!duplex) + goto invalid_speed_param; + *link_speed = ETH_LINK_SPEED_40G; + } else if (!strcmp(value1, "auto")) { + if (!duplex) + goto invalid_speed_param; + *link_speed = ETH_LINK_SPEED_AUTONEG; + } else { + printf("Unknown parameter\n"); + return -1; + } + + return 0; + +invalid_speed_param: + + printf("Invalid speed parameter\n"); + return -1; +} + static void cmd_config_speed_all_parsed(void *parsed_result, __attribute__((unused)) struct cmdline *cl, __attribute__((unused)) void *data) {
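To make the intent of the reworked configuration concrete, here is an illustrative sketch (not taken from the series): the application passes a bitmap of link speeds to advertise, or ETH_LINK_SPEED_AUTONEG to advertise everything the port supports. The name of the bitmap field in struct rte_eth_conf ("link_speeds" below) is an assumption made for illustration, since this excerpt does not show the v7 structure layout; the series also mentions a NO_AUTONEG option for a truly fixed speed, which is not used here.

/* Sketch only: configure the advertised link speeds of a port.
 * The rte_eth_conf field name "link_speeds" is assumed, not confirmed
 * by the excerpt above. */
#include <string.h>
#include <rte_ethdev.h>

static int
configure_port_link(uint8_t port_id, int only_10g)
{
	struct rte_eth_conf conf;

	memset(&conf, 0, sizeof(conf));

	if (only_10g)
		/* advertise a single speed from the new bitmap */
		conf.link_speeds = ETH_LINK_SPEED_10G;
	else
		/* default: auto-negotiate all speeds supported by the port */
		conf.link_speeds = ETH_LINK_SPEED_AUTONEG;

	return rte_eth_dev_configure(port_id, 1, 1, &conf);
}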
[dpdk-dev] [PATCH v7 4/5] doc: update with link changes
Add new features, ABI changes and resolved issues notice for the refactored link patch. Signed-off-by: Marc Sune --- doc/guides/rel_notes/release_2_3.rst | 26 ++ 1 file changed, 26 insertions(+) diff --git a/doc/guides/rel_notes/release_2_3.rst b/doc/guides/rel_notes/release_2_3.rst index 99de186..b10c3bb 100644 --- a/doc/guides/rel_notes/release_2_3.rst +++ b/doc/guides/rel_notes/release_2_3.rst @@ -4,10 +4,28 @@ DPDK Release 2.3 New Features +* **ethdev: define a set of advertised link speeds.** + + Allows defining a set of advertised speeds for auto-negotiation, + explicitly disabling link auto-negotiation (single speed) and full + auto-negotiation. + +* **ethdev: add speed_capa bitmap to report device link speed capabilities.** + + ``struct rte_eth_dev_info`` now has a speed_capa bitmap, which allows the + application to retrieve the supported speeds of that Ethernet device. + Resolved Issues --- +* **ethdev: Fixed link_speed overflow in rte_eth_link for 100Gbps.** + + 100Gbps in Mbps (100000) exceeds the 16 bit max value of ``link_speed`` in + ``rte_eth_link``. + + EAL ~~~ @@ -23,6 +41,9 @@ Libraries Examples +* New API call, rte_eth_speed_to_bm_flag(), in ethdev to map numerical speeds + to bitmap fields. + Other ~ @@ -39,6 +60,11 @@ API Changes ABI Changes --- +* The ethdev rte_eth_link and rte_eth_conf structures were changed to + support the new link API, as well as ETH_LINK_HALF/FULL_DUPLEX. + +* The ethdev rte_eth_dev_info structure was changed to support device speed capabilities. + Shared Library Versions --- -- 2.1.4
[dpdk-dev] [PATCH v7 5/5] ethdev: add rte_eth_speed_to_bm_flag() to ver. map
Added rte_eth_speed_to_bm_flag() to the DPDK 2.3 version map. Signed-off-by: Marc Sune --- lib/librte_ether/rte_ether_version.map | 6 ++ 1 file changed, 6 insertions(+) diff --git a/lib/librte_ether/rte_ether_version.map b/lib/librte_ether/rte_ether_version.map index d8db24d..2c14ad7 100644 --- a/lib/librte_ether/rte_ether_version.map +++ b/lib/librte_ether/rte_ether_version.map @@ -117,3 +117,9 @@ DPDK_2.2 { local: *; }; + +DPDK_2.3 { + global: + + rte_eth_speed_to_bm_flag; +} DPDK_2.2; -- 2.1.4
[dpdk-dev] [RFC PATCH 0/2] Fix examples/distributor build issue for non x86
On Sun, Dec 06, 2015 at 08:54:28PM +0530, Jerin Jacob wrote:
> Introduced rte_prefetch_non_temporal() to remove the IA-specific
> _mm_prefetch(addr, 0) gcc intrinsic and build examples/distributor
> for non-x86 platforms.

Ping for review.

> Not sure the rte_prefetch_non_temporal mapping is correct for
> all the platforms. Architecture maintainers, please check the mapping of
> rte_prefetch_non_temporal() for their specific architectures.
>
> Jerin Jacob (2):
>   eal: introduce rte_prefetch_non_temporal
>   examples/distributor: remove IA specific __mm_prefetch
>
>  examples/distributor/main.c | 9 +
>  lib/librte_eal/common/include/arch/arm/rte_prefetch_32.h | 5 +
>  lib/librte_eal/common/include/arch/arm/rte_prefetch_64.h | 5 +
>  lib/librte_eal/common/include/arch/ppc_64/rte_prefetch.h | 5 +
>  lib/librte_eal/common/include/arch/tile/rte_prefetch.h | 5 +
>  lib/librte_eal/common/include/arch/x86/rte_prefetch.h | 5 +
>  lib/librte_eal/common/include/generic/rte_prefetch.h | 12
>  7 files changed, 42 insertions(+), 4 deletions(-)
>
> --
> 2.1.0
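For readers without the RFC at hand, here is a rough sketch of the kind of per-architecture mapping it describes. This is not the actual patch: the x86 mapping simply reuses the non-temporal hint of _mm_prefetch() mentioned above, and the non-x86 fallback (a plain compiler prefetch) is purely an assumption made for illustration.

/* Sketch only, not the RFC's code. */
#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_I686)
#include <xmmintrin.h>

static inline void
rte_prefetch_non_temporal(const void *p)
{
	/* non-temporal hint: prefetch while minimising cache pollution */
	_mm_prefetch((const char *)p, _MM_HINT_NTA);
}
#else
static inline void
rte_prefetch_non_temporal(const void *p)
{
	/* assumed fallback: an ordinary read prefetch (not from the RFC) */
	__builtin_prefetch(p, 0, 0);
}
#endif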
[dpdk-dev] [PATCH v3 0/3] add lpm support for NEON
- This patch enables lpm for ARM - Used architecture agnostic xmm_t to represent 128 bit SIMD variable in rte_lpm_lookupx4 API definition - Tested on Juno and Thunderx boards - Tested and verified the changes with following DPDK unit test cases --lpm_autotest --lpm6_autotest v1..v2 - make rte_lpm_lookupx4 API definition architecture agnostic - vect_* abstraction scope reduce to only app/test as this abstraction used only to load/store and set vectors in test application which is the consumer of rte_lpm_lookupx4 like API - support for armv7 apart from armv8 - taken changes from Jianbo's lpm patches v2..v3 - add Acked-by for 0001-lpm-make-rte_lpm_lookupx4-API-definition-architectur.patch - re-based to DPDK 2.2 -- fixed the conflict in config/defconfig_arm-armv7a-linuxapp-gcc and MAINTAINERS file Jerin Jacob (3): lpm: make rte_lpm_lookupx4 API definition architecture agnostic lpm: add support for NEON maintainers: claim responsibility for arm64 specific files of hash and lpm MAINTAINERS| 3 + app/test/test_lpm.c| 21 ++-- app/test/test_xmmt_ops.h | 67 + config/defconfig_arm-armv7a-linuxapp-gcc | 3 - config/defconfig_arm64-armv8a-linuxapp-gcc | 3 - lib/librte_lpm/Makefile| 6 ++ lib/librte_lpm/rte_lpm.h | 99 ++- lib/librte_lpm/rte_lpm_neon.h | 148 + lib/librte_lpm/rte_lpm_sse.h | 143 9 files changed, 386 insertions(+), 107 deletions(-) create mode 100644 app/test/test_xmmt_ops.h create mode 100644 lib/librte_lpm/rte_lpm_neon.h create mode 100644 lib/librte_lpm/rte_lpm_sse.h -- 2.1.0
[dpdk-dev] [PATCH v3 1/3] lpm: make rte_lpm_lookupx4 API definition architecture agnostic
-Used architecture agnostic xmm_t to represent 128 bit SIMD variable -Introduced vect_* API abstraction in app/test to test rte_lpm_lookupx4 API in architecture agnostic way -Moved rte_lpm_lookupx4 SSE implementation to architecture specific rte_lpm_sse.h file to accommodate new rte_lpm_lookupx4 implementation for a different architecture. Signed-off-by: Jerin Jacob Acked-by: Konstantin Ananyev --- app/test/test_lpm.c | 21 --- app/test/test_xmmt_ops.h | 47 ++ lib/librte_lpm/Makefile | 2 + lib/librte_lpm/rte_lpm.h | 93 +--- lib/librte_lpm/rte_lpm_sse.h | 143 +++ 5 files changed, 206 insertions(+), 100 deletions(-) create mode 100644 app/test/test_xmmt_ops.h create mode 100644 lib/librte_lpm/rte_lpm_sse.h diff --git a/app/test/test_lpm.c b/app/test/test_lpm.c index 8b4ded9..59674f1 100644 --- a/app/test/test_lpm.c +++ b/app/test/test_lpm.c @@ -49,6 +49,7 @@ #include "rte_lpm.h" #include "test_lpm_routes.h" +#include "test_xmmt_ops.h" #define TEST_LPM_ASSERT(cond) do {\ if (!(cond)) {\ @@ -308,7 +309,7 @@ test6(void) int32_t test7(void) { - __m128i ipx4; + xmm_t ipx4; uint16_t hop[4]; struct rte_lpm *lpm = NULL; uint32_t ip = IPv4(0, 0, 0, 0); @@ -324,7 +325,7 @@ test7(void) status = rte_lpm_lookup(lpm, ip, &next_hop_return); TEST_LPM_ASSERT((status == 0) && (next_hop_return == next_hop_add)); - ipx4 = _mm_set_epi32(ip, ip + 0x100, ip - 0x100, ip); + ipx4 = vect_set_epi32(ip, ip + 0x100, ip - 0x100, ip); rte_lpm_lookupx4(lpm, ipx4, hop, UINT16_MAX); TEST_LPM_ASSERT(hop[0] == next_hop_add); TEST_LPM_ASSERT(hop[1] == UINT16_MAX); @@ -354,7 +355,7 @@ test7(void) int32_t test8(void) { - __m128i ipx4; + xmm_t ipx4; uint16_t hop[4]; struct rte_lpm *lpm = NULL; uint32_t ip1 = IPv4(127, 255, 255, 255), ip2 = IPv4(128, 0, 0, 0); @@ -380,7 +381,7 @@ test8(void) TEST_LPM_ASSERT((status == 0) && (next_hop_return == next_hop_add)); - ipx4 = _mm_set_epi32(ip2, ip1, ip2, ip1); + ipx4 = vect_set_epi32(ip2, ip1, ip2, ip1); rte_lpm_lookupx4(lpm, ipx4, hop, UINT16_MAX); TEST_LPM_ASSERT(hop[0] == UINT16_MAX); TEST_LPM_ASSERT(hop[1] == next_hop_add); @@ -408,7 +409,7 @@ test8(void) status = rte_lpm_lookup(lpm, ip1, &next_hop_return); TEST_LPM_ASSERT(status == -ENOENT); - ipx4 = _mm_set_epi32(ip1, ip1, ip2, ip2); + ipx4 = vect_set_epi32(ip1, ip1, ip2, ip2); rte_lpm_lookupx4(lpm, ipx4, hop, UINT16_MAX); if (depth != 1) { TEST_LPM_ASSERT(hop[0] == next_hop_add); @@ -850,7 +851,7 @@ test11(void) int32_t test12(void) { - __m128i ipx4; + xmm_t ipx4; uint16_t hop[4]; struct rte_lpm *lpm = NULL; uint32_t ip, i; @@ -872,7 +873,7 @@ test12(void) TEST_LPM_ASSERT((status == 0) && (next_hop_return == next_hop_add)); - ipx4 = _mm_set_epi32(ip, ip + 1, ip, ip - 1); + ipx4 = vect_set_epi32(ip, ip + 1, ip, ip - 1); rte_lpm_lookupx4(lpm, ipx4, hop, UINT16_MAX); TEST_LPM_ASSERT(hop[0] == UINT16_MAX); TEST_LPM_ASSERT(hop[1] == next_hop_add); @@ -1289,10 +1290,10 @@ perf_test(void) begin = rte_rdtsc(); for (j = 0; j < BATCH_SIZE; j += RTE_DIM(next_hops)) { unsigned k; - __m128i ipx4; + xmm_t ipx4; - ipx4 = _mm_loadu_si128((__m128i *)(ip_batch + j)); - ipx4 = *(__m128i *)(ip_batch + j); + ipx4 = vect_loadu_sil128((xmm_t *)(ip_batch + j)); + ipx4 = *(xmm_t *)(ip_batch + j); rte_lpm_lookupx4(lpm, ipx4, next_hops, UINT16_MAX); for (k = 0; k < RTE_DIM(next_hops); k++) if (unlikely(next_hops[k] == UINT16_MAX)) diff --git a/app/test/test_xmmt_ops.h b/app/test/test_xmmt_ops.h new file mode 100644 index 000..c055912 --- /dev/null +++ b/app/test/test_xmmt_ops.h @@ -0,0 +1,47 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2015 Cavium Networks. 
All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form mus
[dpdk-dev] [PATCH v3 2/3] lpm: add support for NEON
Enabled CONFIG_RTE_LIBRTE_LPM, CONFIG_RTE_LIBRTE_TABLE, CONFIG_RTE_LIBRTE_PIPELINE libraries for arm and arm64 TABLE, PIPELINE libraries were disabled due to LPM library dependency. Signed-off-by: Jerin Jacob Signed-off-by: Jianbo Liu --- app/test/test_xmmt_ops.h | 20 config/defconfig_arm-armv7a-linuxapp-gcc | 3 - config/defconfig_arm64-armv8a-linuxapp-gcc | 3 - lib/librte_lpm/Makefile| 4 + lib/librte_lpm/rte_lpm.h | 4 + lib/librte_lpm/rte_lpm_neon.h | 148 + 6 files changed, 176 insertions(+), 6 deletions(-) create mode 100644 lib/librte_lpm/rte_lpm_neon.h diff --git a/app/test/test_xmmt_ops.h b/app/test/test_xmmt_ops.h index c055912..c18fc12 100644 --- a/app/test/test_xmmt_ops.h +++ b/app/test/test_xmmt_ops.h @@ -36,6 +36,24 @@ #include +#if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64) + +/* vect_* abstraction implementation using NEON */ + +/* loads the xmm_t value from address p(does not need to be 16-byte aligned)*/ +#define vect_loadu_sil128(p) vld1q_s32((const int32_t *)p) + +/* sets the 4 signed 32-bit integer values and returns the xmm_t variable */ +static inline xmm_t __attribute__((always_inline)) +vect_set_epi32(int i3, int i2, int i1, int i0) +{ + int32_t data[4] = {i0, i1, i2, i3}; + + return vld1q_s32(data); +} + +#else + /* vect_* abstraction implementation using SSE */ /* loads the xmm_t value from address p(does not need to be 16-byte aligned)*/ @@ -44,4 +62,6 @@ /* sets the 4 signed 32-bit integer values and returns the xmm_t variable */ #define vect_set_epi32(i3, i2, i1, i0) _mm_set_epi32(i3, i2, i1, i0) +#endif + #endif /* _TEST_XMMT_OPS_H_ */ diff --git a/config/defconfig_arm-armv7a-linuxapp-gcc b/config/defconfig_arm-armv7a-linuxapp-gcc index cbebd64..efffa1f 100644 --- a/config/defconfig_arm-armv7a-linuxapp-gcc +++ b/config/defconfig_arm-armv7a-linuxapp-gcc @@ -53,9 +53,6 @@ CONFIG_RTE_LIBRTE_KNI=n CONFIG_RTE_EAL_IGB_UIO=n # fails to compile on ARM -CONFIG_RTE_LIBRTE_LPM=n -CONFIG_RTE_LIBRTE_TABLE=n -CONFIG_RTE_LIBRTE_PIPELINE=n CONFIG_RTE_SCHED_VECTOR=n # cannot use those on ARM diff --git a/config/defconfig_arm64-armv8a-linuxapp-gcc b/config/defconfig_arm64-armv8a-linuxapp-gcc index 504f3ed..57f7941 100644 --- a/config/defconfig_arm64-armv8a-linuxapp-gcc +++ b/config/defconfig_arm64-armv8a-linuxapp-gcc @@ -51,7 +51,4 @@ CONFIG_RTE_LIBRTE_IVSHMEM=n CONFIG_RTE_LIBRTE_FM10K_PMD=n CONFIG_RTE_LIBRTE_I40E_PMD=n -CONFIG_RTE_LIBRTE_LPM=n -CONFIG_RTE_LIBRTE_TABLE=n -CONFIG_RTE_LIBRTE_PIPELINE=n CONFIG_RTE_SCHED_VECTOR=n diff --git a/lib/librte_lpm/Makefile b/lib/librte_lpm/Makefile index ce3a1d1..7f93006 100644 --- a/lib/librte_lpm/Makefile +++ b/lib/librte_lpm/Makefile @@ -47,7 +47,11 @@ SRCS-$(CONFIG_RTE_LIBRTE_LPM) := rte_lpm.c rte_lpm6.c # install this header file SYMLINK-$(CONFIG_RTE_LIBRTE_LPM)-include := rte_lpm.h rte_lpm6.h +ifneq ($(filter y,$(CONFIG_RTE_ARCH_ARM) $(CONFIG_RTE_ARCH_ARM64)),) +SYMLINK-$(CONFIG_RTE_LIBRTE_LPM)-include += rte_lpm_neon.h +else SYMLINK-$(CONFIG_RTE_LIBRTE_LPM)-include += rte_lpm_sse.h +endif # this lib needs eal DEPDIRS-$(CONFIG_RTE_LIBRTE_LPM) += lib/librte_eal diff --git a/lib/librte_lpm/rte_lpm.h b/lib/librte_lpm/rte_lpm.h index dfe1378..0c892de 100644 --- a/lib/librte_lpm/rte_lpm.h +++ b/lib/librte_lpm/rte_lpm.h @@ -384,7 +384,11 @@ static inline void rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint16_t hop[4], uint16_t defv); +#if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64) +#include "rte_lpm_neon.h" +#else #include "rte_lpm_sse.h" +#endif #ifdef __cplusplus } diff --git a/lib/librte_lpm/rte_lpm_neon.h 
b/lib/librte_lpm/rte_lpm_neon.h new file mode 100644 index 000..fcd2a8a --- /dev/null +++ b/lib/librte_lpm/rte_lpm_neon.h @@ -0,0 +1,148 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2015 Cavium Networks. All rights reserved. + * All rights reserved. + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Derived rte_lpm_lookupx4 implementation from lib/librte_lpm/rte_lpm_sse.h + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Cavium Networks nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYR
[dpdk-dev] [PATCH v3 3/3] maintainers: claim responsibility for arm64 specific files of hash and lpm
Signed-off-by: Jerin Jacob --- MAINTAINERS | 3 +++ 1 file changed, 3 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index b90aeea..e3fab58 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -138,6 +138,9 @@ M: Jerin Jacob M: Jianbo Liu F: lib/librte_eal/common/include/arch/arm/*_64.h F: lib/librte_acl/acl_run_neon.* +F: lib/librte_lpm/rte_lpm_neon.h +F: lib/librte_hash/rte_crc_arm64.h +F: lib/librte_hash/rte_cmp_arm64.h EZchip TILE-Gx M: Zhigang Lu -- 2.1.0
[dpdk-dev] [PATCH] config: cleanup existing RTE_CACHE_LINE_SIZE selection scheme
On Tue, Dec 08, 2015 at 03:03:34PM +0530, Jerin Jacob wrote:
> On Mon, Dec 07, 2015 at 03:45:10PM +0100, Thomas Monjalon wrote:
> > 2015-12-07 19:52, Jerin Jacob:
> > > by default, all the targets will be configured with the 64-byte cache line
> > > size, targets which have different cache line size can be overridden
> > > through target specific config file.
> > >
> > > Selected ThunderX and power8 as CONFIG_RTE_CACHE_LINE_SIZE=128 targets
> > > based on existing configuration.
> > >
> > > Signed-off-by: Jerin Jacob
> >
> > It looks good.
> > We are moving from a constant defined on CC command line in most cases
> > to a constant included in rte_config.h only.
> > Have you checked that the generated libraries are identical?
>
> Yes, tested with a 64-byte (x86_64-native-linuxapp-gcc) and a 128-byte
> (arm64-thunderx-linuxapp-gcc) target.
> md5sum of build/app/test comes out the same with or without the patch.

ping

> > It would be good to have a check for POWER8. Chao?
>
> Thanks
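As a sketch of the scheme under discussion (file names and values below are illustrative, not taken from the patch): a single default in the common config file, overridden only by the targets that need it.

# common config file: default applied to every target
CONFIG_RTE_CACHE_LINE_SIZE=64

# config/defconfig_arm64-thunderx-linuxapp-gcc: target-specific override
CONFIG_RTE_CACHE_LINE_SIZE=128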
[dpdk-dev] [PATCH v5 01/11] virtio: Introduce config RTE_VIRTIO_INC_VECTOR
Hi Yuan, On Wed, Jan 27, 2016 at 8:03 AM, Yuanhan Liu wrote: > On Wed, Jan 27, 2016 at 07:53:21AM +0530, Santosh Shukla wrote: >> Ping? > > I was on vacation late last week. And I was quite busy till now after > the vacation. So, sorry that I still don't have time to do more detailed > reviews in 1 or 2 days. Hopefully I can make it by this Friday. > > BTW, I had a very glimpse of this patchset, overall, it looks much > better now, except the EAL changes (I'm not the maintainer) and the > virtio io port read/write stuff: Tetsuay suggested to add another > access wraps, but I have few concerns about that. Anyway, I don't > have time for deeper thoughts, and I will re-think it later. > > --yliu did you got the chance for reviewing virtio specific patches? Thanks.
[dpdk-dev] [PATCH v5 01/11] virtio: Introduce config RTE_VIRTIO_INC_VECTOR
On Fri, Jan 29, 2016 at 10:02:26AM +0530, Santosh Shukla wrote: > Hi Yuan, It's Yuanhan, but not Yuan :) > On Wed, Jan 27, 2016 at 8:03 AM, Yuanhan Liu > wrote: > > On Wed, Jan 27, 2016 at 07:53:21AM +0530, Santosh Shukla wrote: > >> Ping? > > > > I was on vacation late last week. And I was quite busy till now after > > the vacation. So, sorry that I still don't have time to do more detailed > > reviews in 1 or 2 days. Hopefully I can make it by this Friday. > > > > BTW, I had a very glimpse of this patchset, overall, it looks much > > better now, except the EAL changes (I'm not the maintainer) and the > > virtio io port read/write stuff: Tetsuay suggested to add another > > access wraps, but I have few concerns about that. Anyway, I don't > > have time for deeper thoughts, and I will re-think it later. > > > > --yliu > > did you got the chance for reviewing virtio specific patches? Thanks. Sorry for the long delay, it's very likely I will check your patches this noon. --yliu
[dpdk-dev] [PATCH v5 01/11] virtio: Introduce config RTE_VIRTIO_INC_VECTOR
On Fri, Jan 29, 2016 at 10:12 AM, Yuanhan Liu wrote: > On Fri, Jan 29, 2016 at 10:02:26AM +0530, Santosh Shukla wrote: >> Hi Yuan, > > It's Yuanhan, but not Yuan :) > Sorry for that, >> On Wed, Jan 27, 2016 at 8:03 AM, Yuanhan Liu >> wrote: >> > On Wed, Jan 27, 2016 at 07:53:21AM +0530, Santosh Shukla wrote: >> >> Ping? >> > >> > I was on vacation late last week. And I was quite busy till now after >> > the vacation. So, sorry that I still don't have time to do more detailed >> > reviews in 1 or 2 days. Hopefully I can make it by this Friday. >> > >> > BTW, I had a very glimpse of this patchset, overall, it looks much >> > better now, except the EAL changes (I'm not the maintainer) and the >> > virtio io port read/write stuff: Tetsuay suggested to add another >> > access wraps, but I have few concerns about that. Anyway, I don't >> > have time for deeper thoughts, and I will re-think it later. >> > >> > --yliu >> >> did you got the chance for reviewing virtio specific patches? Thanks. > > Sorry for the long delay, it's very likely I will check your patches > this noon. > Thanks > --yliu
[dpdk-dev] [PATCH v3 0/8] vhost-user live migration support
This patch set adds vhost-user live migration support.

The major task behind that is to log the pages we touched during live migration, including the used vring and the desc buffers. So, this patch set is basically about adding vhost log support, and using it.

Another important thing is that you need to notify the switches about the VM location change after migration is done. The GUEST_ANNOUNCE feature is for that: it makes the guest send a GARP message after migration. For older kernels (<= v3.4) without GUEST_ANNOUNCE support, we construct and broadcast a RARP message instead, with the mac address from the VHOST_USER_SEND_RARP payload.

Patchset
--------
- Patch 1 handles VHOST_USER_SET_LOG_BASE, which tells us where the dirty memory bitmap is.
- Patch 2 introduces a vhost_log_write() helper function to log the pages we are going to change.
- Patch 3 logs changes we made to the used vring.
- Patch 4 logs changes we made to the vring desc buffers.
- Patches 5 and 7 add some feature bits related to live migration.
- Patch 6 does the RARP construction and broadcast job.

A simple test guide (on same host)
==================================

The following test is based on OVS + DPDK (check [0] for how to set up OVS + DPDK):

[0]: http://wiki.qemu.org/Features/vhost-user-ovs-dpdk

Here is the rough test guide:

1. start ovs-vswitchd

2. Add two ovs vhost-user ports, say vhost0 and vhost1

3. Start VM1 connected to vhost0. Here is my example:

   $ $QEMU -enable-kvm -m 1024 -smp 4 \
     -chardev socket,id=char0,path=/var/run/openvswitch/vhost0 \
     -netdev type=vhost-user,id=mynet1,chardev=char0,vhostforce \
     -device virtio-net-pci,netdev=mynet1,mac=52:54:00:12:34:58 \
     -object memory-backend-file,id=mem,size=1024M,mem-path=$HOME/hugetlbfs,share=on \
     -numa node,memdev=mem -mem-prealloc \
     -kernel $HOME/iso/vmlinuz -append "root=/dev/sda1" \
     -hda fc-19-i386.img \
     -monitor telnet::,server,nowait -curses

4. run "ping $host" inside VM1

5. Start VM2 connected to vhost1, and mark it as the target of the live migration (by adding the -incoming tcp:0: option):

   $ $QEMU -enable-kvm -m 1024 -smp 4 \
     -chardev socket,id=char0,path=/var/run/openvswitch/vhost1 \
     -netdev type=vhost-user,id=mynet1,chardev=char0,vhostforce \
     -device virtio-net-pci,netdev=mynet1,mac=52:54:00:12:34:58 \
     -object memory-backend-file,id=mem,size=1024M,mem-path=$HOME/hugetlbfs,share=on \
     -numa node,memdev=mem -mem-prealloc \
     -kernel $HOME/iso/vmlinuz -append "root=/dev/sda1" \
     -hda fc-19-i386.img \
     -monitor telnet::3334,server,nowait -curses \
     -incoming tcp:0:

6. Connect to the VM1 monitor, and start the migration:

   > migrate tcp:0:

7. After a while, you will find that VM1 has been migrated to VM2, and the "ping" command continues running, perfectly.

---
Yuanhan Liu (8):
  vhost: handle VHOST_USER_SET_LOG_BASE request
  vhost: introduce vhost_log_write
  vhost: log used vring changes
  vhost: log vring desc buffer changes
  vhost: claim that we support GUEST_ANNOUNCE feature
  vhost: handle VHOST_USER_SEND_RARP request
  vhost: enable log_shmfd protocol feature
  vhost: remove duplicate header include

 doc/guides/rel_notes/release_2_3.rst | 2 +
 lib/librte_vhost/rte_virtio_net.h | 9 +-
 lib/librte_vhost/vhost_rxtx.c | 114 +
 lib/librte_vhost/vhost_user/vhost-net-user.c | 11 +-
 lib/librte_vhost/vhost_user/vhost-net-user.h | 7 ++
 lib/librte_vhost/vhost_user/virtio-net-user.c | 174 +-
 lib/librte_vhost/vhost_user/virtio-net-user.h | 8 +-
 lib/librte_vhost/virtio-net.c | 5 +
 8 files changed, 299 insertions(+), 31 deletions(-)

-- 1.9.0
[dpdk-dev] [PATCH v3 1/8] vhost: handle VHOST_USER_SET_LOG_BASE request
VHOST_USER_SET_LOG_BASE request is used to tell the backend (dpdk vhost-user) where we should log dirty pages, and how big the log buffer is. This request introduces a new payload: typedef struct VhostUserLog { uint64_t mmap_size; uint64_t mmap_offset; } VhostUserLog; Also, a fd is delivered from QEMU by ancillary data. With those info given, an area of memory is mmaped, assigned to dev->log_base, for logging dirty pages. Signed-off-by: Yuanhan Liu Signed-off-by: Victor Kaplansky Tested-by: Pavel Fedin --- lib/librte_vhost/rte_virtio_net.h | 4 ++- lib/librte_vhost/vhost_user/vhost-net-user.c | 7 ++-- lib/librte_vhost/vhost_user/vhost-net-user.h | 6 lib/librte_vhost/vhost_user/virtio-net-user.c | 48 +++ lib/librte_vhost/vhost_user/virtio-net-user.h | 1 + 5 files changed, 63 insertions(+), 3 deletions(-) diff --git a/lib/librte_vhost/rte_virtio_net.h b/lib/librte_vhost/rte_virtio_net.h index 10dcb90..8acee02 100644 --- a/lib/librte_vhost/rte_virtio_net.h +++ b/lib/librte_vhost/rte_virtio_net.h @@ -129,7 +129,9 @@ struct virtio_net { charifname[IF_NAME_SZ]; /**< Name of the tap device or socket path. */ uint32_tvirt_qp_nb; /**< number of queue pair we have allocated */ void*priv; /**< private context */ - uint64_treserved[64]; /**< Reserve some spaces for future extension. */ + uint64_tlog_size; /**< Size of log area */ + uint64_tlog_base; /**< Where dirty pages are logged */ + uint64_treserved[62]; /**< Reserve some spaces for future extension. */ struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2]; /**< Contains all virtqueue information. */ } __rte_cache_aligned; diff --git a/lib/librte_vhost/vhost_user/vhost-net-user.c b/lib/librte_vhost/vhost_user/vhost-net-user.c index 8b7a448..32ad6f6 100644 --- a/lib/librte_vhost/vhost_user/vhost-net-user.c +++ b/lib/librte_vhost/vhost_user/vhost-net-user.c @@ -388,9 +388,12 @@ vserver_message_handler(int connfd, void *dat, int *remove) break; case VHOST_USER_SET_LOG_BASE: - RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n"); - break; + user_set_log_base(ctx, &msg); + /* it needs a reply */ + msg.size = sizeof(msg.payload.u64); + send_vhost_message(connfd, &msg); + break; case VHOST_USER_SET_LOG_FD: close(msg.fds[0]); RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n"); diff --git a/lib/librte_vhost/vhost_user/vhost-net-user.h b/lib/librte_vhost/vhost_user/vhost-net-user.h index 38637cc..6d252a3 100644 --- a/lib/librte_vhost/vhost_user/vhost-net-user.h +++ b/lib/librte_vhost/vhost_user/vhost-net-user.h @@ -83,6 +83,11 @@ typedef struct VhostUserMemory { VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS]; } VhostUserMemory; +typedef struct VhostUserLog { + uint64_t mmap_size; + uint64_t mmap_offset; +} VhostUserLog; + typedef struct VhostUserMsg { VhostUserRequest request; @@ -97,6 +102,7 @@ typedef struct VhostUserMsg { struct vhost_vring_state state; struct vhost_vring_addr addr; VhostUserMemory memory; + VhostUserLoglog; } payload; int fds[VHOST_MEMORY_MAX_NREGIONS]; } __attribute((packed)) VhostUserMsg; diff --git a/lib/librte_vhost/vhost_user/virtio-net-user.c b/lib/librte_vhost/vhost_user/virtio-net-user.c index 2934d1c..0f3b163 100644 --- a/lib/librte_vhost/vhost_user/virtio-net-user.c +++ b/lib/librte_vhost/vhost_user/virtio-net-user.c @@ -365,3 +365,51 @@ user_set_protocol_features(struct vhost_device_ctx ctx, dev->protocol_features = protocol_features; } + +int +user_set_log_base(struct vhost_device_ctx ctx, +struct VhostUserMsg *msg) +{ + struct virtio_net *dev; + int fd = msg->fds[0]; + uint64_t size, off; + void *addr; + + dev 
= get_device(ctx); + if (!dev) + return -1; + + if (fd < 0) { + RTE_LOG(ERR, VHOST_CONFIG, "invalid log fd: %d\n", fd); + return -1; + } + + if (msg->size != sizeof(VhostUserLog)) { + RTE_LOG(ERR, VHOST_CONFIG, + "invalid log base msg size: %"PRId32" != %d\n", + msg->size, (int)sizeof(VhostUserLog)); + return -1; + } + + size = msg->payload.log.mmap_size; + off = msg->payload.log.mmap_offset; + RTE_LOG(INFO, VHOST_CONFIG, + "log mmap size: %"PRId64", offset: %"PRId64"\n", + size, off); + + /* +* mmap from 0 to workaround a hugepage mmap bug: mmap will +* fail when offset is
[dpdk-dev] [PATCH v3 2/8] vhost: introduce vhost_log_write
Introduce the vhost_log_write() helper function to log the dirty pages we touched. The page size is hard-coded to 4096 (VHOST_LOG_PAGE), and each page is represented by 1 bit. Therefore, vhost_log_write() simply finds the right bit for the related page we are going to change, and sets it to 1. dev->log_base denotes the start of the dirty page bitmap. Signed-off-by: Yuanhan Liu Signed-off-by: Victor Kaplansky Tested-by: Pavel Fedin --- v3: - move the two functions inside vhost_rxtx.c - add a memory barrier before logging, to make sure guest memory updates are committed before logging. - Fix an off-by-one bug --- lib/librte_vhost/rte_virtio_net.h | 2 ++ lib/librte_vhost/vhost_rxtx.c | 29 + 2 files changed, 31 insertions(+) diff --git a/lib/librte_vhost/rte_virtio_net.h b/lib/librte_vhost/rte_virtio_net.h index 8acee02..2c891ae 100644 --- a/lib/librte_vhost/rte_virtio_net.h +++ b/lib/librte_vhost/rte_virtio_net.h @@ -40,6 +40,7 @@ */ #include +#include #include #include #include @@ -205,6 +206,7 @@ gpa_to_vva(struct virtio_net *dev, uint64_t guest_pa) return vhost_va; } + /** * Disable features in feature_mask. Returns 0 on success. */ diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c index bbf3fac..d485364 100644 --- a/lib/librte_vhost/vhost_rxtx.c +++ b/lib/librte_vhost/vhost_rxtx.c @@ -42,6 +42,35 @@ #include "vhost-net.h" #define MAX_PKT_BURST 32 +#define VHOST_LOG_PAGE 4096 + +static inline void __attribute__((always_inline)) +vhost_log_page(uint8_t *log_base, uint64_t page) +{ + log_base[page / 8] |= 1 << (page % 8); +} + +static inline void __attribute__((always_inline)) +vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len) +{ + uint64_t page; + + if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) || + !dev->log_base || !len)) + return; + + if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8))) + return; + + /* To make sure guest memory updates are committed before logging */ + rte_smp_wmb(); + + page = addr / VHOST_LOG_PAGE; + while (page * VHOST_LOG_PAGE < addr + len) { + vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page); + page += 1; + } +} static bool is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb) -- 1.9.0
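A quick worked illustration of the bitmap arithmetic described above (not part of the patch): with 4 KiB pages and one bit per page, a write of len bytes at guest physical address addr dirties every page that the range [addr, addr + len) overlaps.

/* Standalone illustration of the dirty-page bit math (not patch code). */
#include <inttypes.h>
#include <stdio.h>

#define VHOST_LOG_PAGE 4096

int main(void)
{
	uint64_t addr = 0x5010;	/* example guest physical address */
	uint64_t len  = 0x2000;	/* example write length */
	uint64_t page;

	/* pages 5, 6 and 7 are dirtied for this addr/len pair */
	for (page = addr / VHOST_LOG_PAGE;
	     page * VHOST_LOG_PAGE < addr + len; page++)
		/* page N maps to byte N/8, bit N%8 of the log area */
		printf("dirty page %" PRIu64 " -> log byte %" PRIu64 ", bit %u\n",
		       page, page / 8, (unsigned)(page % 8));

	return 0;
}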
[dpdk-dev] [PATCH v3 3/8] vhost: log used vring changes
Every time we update virtio used ring, we need to log it. And it's been done by a new vhost_log_write() wrapper, vhost_log_used_vring(). Signed-off-by: Yuanhan Liu Signed-off-by: Victor Kaplansky Tested-by: Pavel Fedin --- v3: remove the unnecessary var "addr" iat vhost_log_used_vring() --- lib/librte_vhost/rte_virtio_net.h | 3 +- lib/librte_vhost/vhost_rxtx.c | 77 +++ lib/librte_vhost/virtio-net.c | 4 ++ 3 files changed, 59 insertions(+), 25 deletions(-) diff --git a/lib/librte_vhost/rte_virtio_net.h b/lib/librte_vhost/rte_virtio_net.h index 2c891ae..4a2303a 100644 --- a/lib/librte_vhost/rte_virtio_net.h +++ b/lib/librte_vhost/rte_virtio_net.h @@ -91,7 +91,8 @@ struct vhost_virtqueue { int callfd; /**< Used to notify the guest (trigger interrupt). */ int kickfd; /**< Currently unused as polling mode is enabled. */ int enabled; - uint64_treserved[16]; /**< Reserve some spaces for future extension. */ + uint64_tlog_guest_addr; /**< Physical address of used ring, for logging */ + uint64_treserved[15]; /**< Reserve some spaces for future extension. */ struct buf_vector buf_vec[BUF_VECTOR_MAX];/**< for scatter RX. */ } __rte_cache_aligned; diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c index d485364..9f7b1f8 100644 --- a/lib/librte_vhost/vhost_rxtx.c +++ b/lib/librte_vhost/vhost_rxtx.c @@ -72,6 +72,13 @@ vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len) } } +static inline void __attribute__((always_inline)) +vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq, +uint64_t offset, uint64_t len) +{ + vhost_log_write(dev, vq->log_guest_addr + offset, len); +} + static bool is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb) { @@ -158,6 +165,7 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, uint32_t offset = 0, vb_offset = 0; uint32_t pkt_len, len_to_cpy, data_len, total_copied = 0; uint8_t hdr = 0, uncompleted_pkt = 0; + uint16_t idx; /* Get descriptor from available ring */ desc = &vq->desc[head[packet_success]]; @@ -229,16 +237,18 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, } /* Update used ring with desc information */ - vq->used->ring[res_cur_idx & (vq->size - 1)].id = - head[packet_success]; + idx = res_cur_idx & (vq->size - 1); + vq->used->ring[idx].id = head[packet_success]; /* Drop the packet if it is uncompleted */ if (unlikely(uncompleted_pkt == 1)) - vq->used->ring[res_cur_idx & (vq->size - 1)].len = - vq->vhost_hlen; + vq->used->ring[idx].len = vq->vhost_hlen; else - vq->used->ring[res_cur_idx & (vq->size - 1)].len = - pkt_len + vq->vhost_hlen; + vq->used->ring[idx].len = pkt_len + vq->vhost_hlen; + + vhost_log_used_vring(dev, vq, + offsetof(struct vring_used, ring[idx]), + sizeof(vq->used->ring[idx])); res_cur_idx++; packet_success++; @@ -265,6 +275,9 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, *(volatile uint16_t *)&vq->used->idx += count; vq->last_used_idx = res_end_idx; + vhost_log_used_vring(dev, vq, + offsetof(struct vring_used, idx), + sizeof(vq->used->idx)); /* flush used->idx update before we read avail->flags. 
*/ rte_mb(); @@ -294,6 +307,7 @@ copy_from_mbuf_to_vring(struct virtio_net *dev, uint32_t queue_id, uint32_t seg_avail; uint32_t vb_avail; uint32_t cpy_len, entry_len; + uint16_t idx; if (pkt == NULL) return 0; @@ -331,16 +345,18 @@ copy_from_mbuf_to_vring(struct virtio_net *dev, uint32_t queue_id, entry_len = vq->vhost_hlen; if (vb_avail == 0) { - uint32_t desc_idx = - vq->buf_vec[vec_idx].desc_idx; + uint32_t desc_idx = vq->buf_vec[vec_idx].desc_idx; + + if ((vq->desc[desc_idx].flags & VRING_DESC_F_NEXT) == 0) { + idx = cur_idx & (vq->size - 1); - if ((vq->desc[desc_idx].flags - & VRING_DESC_F_NEXT) == 0) { /* Update used ring with desc information */ - vq->used->ring[cur_idx &
[dpdk-dev] [PATCH v3 4/8] vhost: log vring desc buffer changes
Every time we copy a buf to vring desc, we need to log it. Signed-off-by: Yuanhan Liu Signed-off-by: Victor Kaplansky Tested-by: Pavel Fedin --- lib/librte_vhost/vhost_rxtx.c | 8 +++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c index 9f7b1f8..9c6cc00 100644 --- a/lib/librte_vhost/vhost_rxtx.c +++ b/lib/librte_vhost/vhost_rxtx.c @@ -97,7 +97,7 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, struct rte_mbuf **pkts, uint32_t count) { struct vhost_virtqueue *vq; - struct vring_desc *desc; + struct vring_desc *desc, *hdr_desc; struct rte_mbuf *buff; /* The virtio_hdr is initialised to 0. */ struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0}; @@ -179,6 +179,7 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, /* Copy virtio_hdr to packet and increment buffer address */ buff_hdr_addr = buff_addr; + hdr_desc = desc; /* * If the descriptors are chained the header and data are @@ -203,6 +204,7 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, rte_memcpy((void *)(uintptr_t)(buff_addr + vb_offset), rte_pktmbuf_mtod_offset(buff, const void *, offset), len_to_cpy); + vhost_log_write(dev, desc->addr + vb_offset, len_to_cpy); PRINT_PACKET(dev, (uintptr_t)(buff_addr + vb_offset), len_to_cpy, 0); @@ -258,6 +260,7 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void *)&virtio_hdr, vq->vhost_hlen); + vhost_log_write(dev, hdr_desc->addr, vq->vhost_hlen); PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); @@ -335,6 +338,7 @@ copy_from_mbuf_to_vring(struct virtio_net *dev, uint32_t queue_id, rte_memcpy((void *)(uintptr_t)vb_hdr_addr, (const void *)&virtio_hdr, vq->vhost_hlen); + vhost_log_write(dev, vq->buf_vec[vec_idx].buf_addr, vq->vhost_hlen); PRINT_PACKET(dev, (uintptr_t)vb_hdr_addr, vq->vhost_hlen, 1); @@ -379,6 +383,8 @@ copy_from_mbuf_to_vring(struct virtio_net *dev, uint32_t queue_id, rte_memcpy((void *)(uintptr_t)(vb_addr + vb_offset), rte_pktmbuf_mtod_offset(pkt, const void *, seg_offset), cpy_len); + vhost_log_write(dev, vq->buf_vec[vec_idx].buf_addr + vb_offset, + cpy_len); PRINT_PACKET(dev, (uintptr_t)(vb_addr + vb_offset), -- 1.9.0
[dpdk-dev] [PATCH v3 5/8] vhost: claim that we support GUEST_ANNOUNCE feature
It's actually a feature already enabled in Linux kernel (since v3.5). What we need to do is simply to claim that we support such feature, and nothing else. With that, the guest will send an ARP message after live migration to notify the switches about the new location of migrated VM. Signed-off-by: Yuanhan Liu Tested-by: Pavel Fedin --- lib/librte_vhost/virtio-net.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c index 03044f6..0ba5045 100644 --- a/lib/librte_vhost/virtio-net.c +++ b/lib/librte_vhost/virtio-net.c @@ -74,6 +74,7 @@ static struct virtio_net_config_ll *ll_root; #define VHOST_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \ (1ULL << VIRTIO_NET_F_CTRL_VQ) | \ (1ULL << VIRTIO_NET_F_CTRL_RX) | \ + (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) | \ (VHOST_SUPPORTS_MQ)| \ (1ULL << VIRTIO_F_VERSION_1) | \ (1ULL << VHOST_F_LOG_ALL) | \ -- 1.9.0
[dpdk-dev] [PATCH v3 6/8] vhost: handle VHOST_USER_SEND_RARP request
While in former patch we enabled GUEST_ANNOUNCE feature, so that the guest OS will broadcast a GARP message after migration to notify the switch about the new location of migrated VM, the thing is that GUEST_ANNOUNCE is enabled since kernel v3.5 only. For older kernel, VHOST_USER_SEND_RARP request comes to rescue. The payload of this new request is the mac address of the migrated VM, with that, we could construct a RARP message, and then broadcast it to host interfaces. That's how this patch works: - list all interfaces, with the help of SIOCGIFCONF ioctl command - construct an RARP message and broadcast it Cc: Thibaut Collet Signed-off-by: Yuanhan Liu --- Note that this patch did take effect in my test: - it indeed updated target vswitch's mac learning table. (with the "ovs fdb/show bridge" command) - the ping request packets after migration were indeeded flowed to the target (but not the source) host's vswitch. (with tcpdump command) However, I still saw ping lost. I asked help from Thibaut, the original author of the VHOST_USER_SEND_RARP request, he suggested that it might be an issue of my network topo, or ovs settings, which is likely, regarding to what I observed above. Anyway, I'd like to send this out, hopefully someone knows what's wrong there if there any. In the meantime, I will do more debugs. --- lib/librte_vhost/vhost_user/vhost-net-user.c | 4 + lib/librte_vhost/vhost_user/vhost-net-user.h | 1 + lib/librte_vhost/vhost_user/virtio-net-user.c | 125 ++ lib/librte_vhost/vhost_user/virtio-net-user.h | 5 +- 4 files changed, 134 insertions(+), 1 deletion(-) diff --git a/lib/librte_vhost/vhost_user/vhost-net-user.c b/lib/librte_vhost/vhost_user/vhost-net-user.c index 32ad6f6..cb18396 100644 --- a/lib/librte_vhost/vhost_user/vhost-net-user.c +++ b/lib/librte_vhost/vhost_user/vhost-net-user.c @@ -100,6 +100,7 @@ static const char *vhost_message_str[VHOST_USER_MAX] = { [VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES", [VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM", [VHOST_USER_SET_VRING_ENABLE] = "VHOST_USER_SET_VRING_ENABLE", + [VHOST_USER_SEND_RARP] = "VHOST_USER_SEND_RARP", }; /** @@ -437,6 +438,9 @@ vserver_message_handler(int connfd, void *dat, int *remove) case VHOST_USER_SET_VRING_ENABLE: user_set_vring_enable(ctx, &msg.payload.state); break; + case VHOST_USER_SEND_RARP: + user_send_rarp(&msg); + break; default: break; diff --git a/lib/librte_vhost/vhost_user/vhost-net-user.h b/lib/librte_vhost/vhost_user/vhost-net-user.h index 6d252a3..e3bb413 100644 --- a/lib/librte_vhost/vhost_user/vhost-net-user.h +++ b/lib/librte_vhost/vhost_user/vhost-net-user.h @@ -67,6 +67,7 @@ typedef enum VhostUserRequest { VHOST_USER_SET_PROTOCOL_FEATURES = 16, VHOST_USER_GET_QUEUE_NUM = 17, VHOST_USER_SET_VRING_ENABLE = 18, + VHOST_USER_SEND_RARP = 19, VHOST_USER_MAX } VhostUserRequest; diff --git a/lib/librte_vhost/vhost_user/virtio-net-user.c b/lib/librte_vhost/vhost_user/virtio-net-user.c index 0f3b163..cda330d 100644 --- a/lib/librte_vhost/vhost_user/virtio-net-user.c +++ b/lib/librte_vhost/vhost_user/virtio-net-user.c @@ -34,11 +34,18 @@ #include #include #include +#include #include #include #include #include #include +#include +#include +#include +#include +#include +#include #include #include @@ -413,3 +420,121 @@ user_set_log_base(struct vhost_device_ctx ctx, return 0; } + +#define RARP_BUF_SIZE 64 + +static void +make_rarp_packet(uint8_t *buf, uint8_t *mac) +{ + struct ether_header *eth_hdr; + struct ether_arp *rarp; + + /* Ethernet header. 
*/ + eth_hdr = (struct ether_header *)buf; + memset(ð_hdr->ether_dhost, 0xff, ETH_ALEN); + memcpy(ð_hdr->ether_shost, mac, ETH_ALEN); + eth_hdr->ether_type = htons(ETH_P_RARP); + + /* RARP header. */ + rarp = (struct ether_arp *)(eth_hdr + 1); + rarp->ea_hdr.ar_hrd = htons(ARPHRD_ETHER); + rarp->ea_hdr.ar_pro = htons(ETHERTYPE_IP); + rarp->ea_hdr.ar_hln = ETH_ALEN; + rarp->ea_hdr.ar_pln = 4; + rarp->ea_hdr.ar_op = htons(ARPOP_RREQUEST); + + memcpy(&rarp->arp_sha, mac, ETH_ALEN); + memset(&rarp->arp_spa, 0x00, 4); + memcpy(&rarp->arp_tha, mac, 6); + memset(&rarp->arp_tpa, 0x00, 4); +} + + +static void +send_rarp(const char *ifname, uint8_t *rarp) +{ + int fd; + struct ifreq ifr; + struct sockaddr_ll addr; + + fd = socket(AF_PACKET, SOCK_RAW, 0); + if (fd < 0) { + perror("socket failed"); + return; + } + + memset(&ifr, 0, sizeof(struct ifreq)); + strncpy(ifr.ifr_name, ifname, IFNAMSIZ); + if (ioctl(fd, SIOCGIFINDEX, &ifr) < 0) { + perror("failed t
[dpdk-dev] [PATCH v3 7/8] vhost: enable log_shmfd protocol feature
To claim that we support vhost-user live migration support: SET_LOG_BASE request will be send only when this feature flag is set. Besides this flag, we actually need another feature flag set to make vhost-user live migration work: VHOST_F_LOG_ALL. Which, however, has been enabled long time ago. Signed-off-by: Yuanhan Liu Tested-by: Pavel Fedin --- doc/guides/rel_notes/release_2_3.rst | 2 ++ lib/librte_vhost/vhost_user/virtio-net-user.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/doc/guides/rel_notes/release_2_3.rst b/doc/guides/rel_notes/release_2_3.rst index 99de186..f2c9e41 100644 --- a/doc/guides/rel_notes/release_2_3.rst +++ b/doc/guides/rel_notes/release_2_3.rst @@ -4,6 +4,8 @@ DPDK Release 2.3 New Features +* **Added vhost-user live migration support.** + Resolved Issues --- diff --git a/lib/librte_vhost/vhost_user/virtio-net-user.h b/lib/librte_vhost/vhost_user/virtio-net-user.h index 1e9ff9a..28213f3 100644 --- a/lib/librte_vhost/vhost_user/virtio-net-user.h +++ b/lib/librte_vhost/vhost_user/virtio-net-user.h @@ -38,9 +38,11 @@ #include "vhost-net-user.h" #define VHOST_USER_PROTOCOL_F_MQ 0 +#define VHOST_USER_PROTOCOL_F_LOG_SHMFD1 #define VHOST_USER_PROTOCOL_F_RARP 2 #define VHOST_USER_PROTOCOL_FEATURES ((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \ +(1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) |\ (1ULL << VHOST_USER_PROTOCOL_F_RARP)) int user_set_mem_table(struct vhost_device_ctx, struct VhostUserMsg *); -- 1.9.0
[dpdk-dev] [PATCH v3 8/8] vhost: remove duplicate header include
unistd.h has been included twice; remove one. Signed-off-by: Yuanhan Liu --- lib/librte_vhost/vhost_user/virtio-net-user.c | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/librte_vhost/vhost_user/virtio-net-user.c b/lib/librte_vhost/vhost_user/virtio-net-user.c index cda330d..4270c98 100644 --- a/lib/librte_vhost/vhost_user/virtio-net-user.c +++ b/lib/librte_vhost/vhost_user/virtio-net-user.c @@ -39,7 +39,6 @@ #include #include #include -#include #include #include #include -- 1.9.0
[dpdk-dev] [PATCH] ixgbe: Fix disable interrupt twice
Currently, ixgbe vf and pf will disable interrupte twice in stop stage and uninit stage. It will cause an error: testpmd> quit Shutting down port 0... Stopping ports... Done Closing ports... EAL: Error disabling MSI-X interrupts for fd 26 Done Becasue the interrupt already been disabled in stop stage. Since it is enabled in init stage, better remove from stop stage. Fixes: 0eb609239efd ("ixgbe: enable Rx queue interrupts for PF and VF") Signed-off-by: Michael Qiu --- drivers/net/ixgbe/ixgbe_ethdev.c | 6 -- 1 file changed, 6 deletions(-) diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c index 4c4c6df..a561f8d 100644 --- a/drivers/net/ixgbe/ixgbe_ethdev.c +++ b/drivers/net/ixgbe/ixgbe_ethdev.c @@ -2192,9 +2192,6 @@ ixgbe_dev_stop(struct rte_eth_dev *dev) /* disable interrupts */ ixgbe_disable_intr(hw); - /* disable intr eventfd mapping */ - rte_intr_disable(intr_handle); - /* reset the NIC */ ixgbe_pf_reset_hw(hw); hw->adapter_stopped = 0; @@ -3898,9 +3895,6 @@ ixgbevf_dev_stop(struct rte_eth_dev *dev) ixgbe_dev_clear_queues(dev); - /* disable intr eventfd mapping */ - rte_intr_disable(intr_handle); - /* Clean datapath event and queue/vec mapping */ rte_intr_efd_disable(intr_handle); if (intr_handle->intr_vec != NULL) { -- 1.9.3
[dpdk-dev] [PATCH v2] ixgbe: Fix disable interrupt twice
Currently, ixgbe vf and pf will disable interrupt twice in stop stage and uninit stage. It will cause an error: testpmd> quit Shutting down port 0... Stopping ports... Done Closing ports... EAL: Error disabling MSI-X interrupts for fd 26 Done This is because the interrupt has already been disabled in the stop stage. Since it is enabled in the init stage, it is better to remove the call from the stop stage. Fixes: 0eb609239efd ("ixgbe: enable Rx queue interrupts for PF and VF") Signed-off-by: Michael Qiu --- v2 --> v1: fix error in commit log word "interrupte" drivers/net/ixgbe/ixgbe_ethdev.c | 6 -- 1 file changed, 6 deletions(-) diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c index 4c4c6df..a561f8d 100644 --- a/drivers/net/ixgbe/ixgbe_ethdev.c +++ b/drivers/net/ixgbe/ixgbe_ethdev.c @@ -2192,9 +2192,6 @@ ixgbe_dev_stop(struct rte_eth_dev *dev) /* disable interrupts */ ixgbe_disable_intr(hw); - /* disable intr eventfd mapping */ - rte_intr_disable(intr_handle); - /* reset the NIC */ ixgbe_pf_reset_hw(hw); hw->adapter_stopped = 0; @@ -3898,9 +3895,6 @@ ixgbevf_dev_stop(struct rte_eth_dev *dev) ixgbe_dev_clear_queues(dev); - /* disable intr eventfd mapping */ - rte_intr_disable(intr_handle); - /* Clean datapath event and queue/vec mapping */ rte_intr_efd_disable(intr_handle); if (intr_handle->intr_vec != NULL) { -- 1.9.3
[dpdk-dev] [PATCH v5 10/11] virtio: pci: add dummy func definition for in/outb for non-x86 arch
On Tue, Jan 19, 2016 at 05:16:11PM +0530, Santosh Shukla wrote: > For non-x86 arch, Compiler will throw build error for in/out apis. Including > dummy api function so to pass build. > > Note that: For virtio to work for non-x86 arch - RTE_EAL_VFIO is the only > supported method. RTE_EAL_IGB_UIO is not supported for non-x86 arch. > > So, Virtio support for arch and supported interface by that arch: > > ARCH IGB_UIOVFIO > x86 Y Y > ARM64 N/A Y > PPC_64N/A Y (Not tested but likely should work, as > vfio is > arch independent) > > Note: Applicable for virtio spec 0.95 > > Signed-off-by: Santosh Shukla > --- > drivers/net/virtio/virtio_pci.h | 46 > +++ > 1 file changed, 46 insertions(+) > > diff --git a/drivers/net/virtio/virtio_pci.h b/drivers/net/virtio/virtio_pci.h > index f550d22..b88f9ec 100644 > --- a/drivers/net/virtio/virtio_pci.h > +++ b/drivers/net/virtio/virtio_pci.h > @@ -46,6 +46,7 @@ > #endif > > #include > +#include "virtio_logs.h" > > struct virtqueue; > > @@ -320,6 +321,51 @@ outl_p(unsigned int data, unsigned int port) > } > #endif > > +#if !defined(RTE_ARCH_X86_64) && !defined(RTE_ARCH_I686) && \ > + defined(RTE_EXEC_ENV_LINUXAPP) > +static inline uint8_t > +inb(unsigned long addr __rte_unused) > +{ > + PMD_INIT_LOG(ERR, "inb() not supported for this RTE_ARCH\n"); > + return 0; > +} The whole port read/write stuff is getting messy here: we not only have to care the FreeBSD and Linux difference, but also the x86 and non-x86 difference. And you just added yet another vfio layer. First of all, they are not belong here (virtio_pci.h). A new place like virtio_io.h sounds much better to me. Therefore, I'd suggest you to put all those stuff there, like the one I have just cooked up: #ifndef __VIRTIO_IO_H__ #if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_I686) #ifdef __FreeBSD__ #include #include #define outb_p outb #define outw_p outw #define outl_p outl #else #include #endif #else /* ! X86 */ static inline uint8_t inb(unsigned long addr __rte_unused) { PMD_INIT_LOG(ERR, "inb() not supported for this RTE_ARCH\n"); return 0; } #endif /* X86 */ /* And put the vfio io port read/write here */ #endif /* __VIRTIO_IO_H__ */ Note that you may need squash patch 4 (build fix for sys/io.h ...) here. They both resolve one thing: to make it build on non-x86 platforms. Another minor note is that while you are trying this way, I'd suggest you to make a patch to introduce virtio_io.h, and then make another patch to fix build errors on non-x86 platforms. Another generic comment about this patchset is that it VERY okay to include several components change in one set, but putting them in order helps review a lot. Say, this patch set has dependence on VFIO stuff, therefore, it'd be much better __IF__ you can put all VFIO related patches first, and then virtio related patches follows, but not in an interleaved way you did. If, for somereason, you can't do that, you should at least try to minimise the chance of interleave. --yliu
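For reference, a self-contained version of the virtio_io.h layout sketched above, with the include lines that the archive stripped filled back in; the FreeBSD headers and the Linux sys/io.h include are assumptions taken from the existing virtio_pci.h code and the "build fix for sys/io.h" patch mentioned above, not part of the original mail.

#ifndef __VIRTIO_IO_H__
#define __VIRTIO_IO_H__

#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_I686)

#ifdef __FreeBSD__
#include <sys/types.h>
#include <machine/cpufunc.h>

#define outb_p outb
#define outw_p outw
#define outl_p outl
#else
#include <sys/io.h>
#endif

#else /* ! X86 */

#include <rte_common.h>
#include "virtio_logs.h"

static inline uint8_t
inb(unsigned long addr __rte_unused)
{
	PMD_INIT_LOG(ERR, "inb() not supported for this RTE_ARCH\n");
	return 0;
}

/* outb(), inw(), outw(), inl() and outl() stubs would follow the same pattern. */

#endif /* X86 */

/* The VFIO io port read/write helpers would also live here. */

#endif /* __VIRTIO_IO_H__ */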
[dpdk-dev] [PATCH 0/8] support E-tag offloading and forwarding on Intel X550 NIC
This patch set adds the support of E-tag offloading and forwarding on X550. The offloading means E-tag can be inserted and stripped by HW. And E-tag packets can be recognized and forwarded to specific pools based on GRP and E-CID_base in E-tag. Wenzhuo Lu (8): ixgbe: select pool by MAC when using double VLAN lib/librte_ether: support l2 tunnel config ixgbe: support l2 tunnel config app/testpmd: add CLIs for l2 tunnel config lib/librte_ether: support new l2 tunnel operation ixgbe: support l2 tunnel operation app/testpmd: add CLIs for E-tag operation doc: add release note for E-tag app/test-pmd/cmdline.c | 599 +++ doc/guides/rel_notes/release_2_3.rst | 6 + drivers/net/ixgbe/ixgbe_ethdev.c | 507 + lib/librte_ether/rte_eth_ctrl.h | 9 + lib/librte_ether/rte_ethdev.c| 239 ++ lib/librte_ether/rte_ethdev.h| 288 + 6 files changed, 1648 insertions(+) -- 1.9.3
[dpdk-dev] [PATCH 1/8] ixgbe: select pool by MAC when using double VLAN
On X550, as required by datasheet, E-tag packets are not expected when double VLAN are used. So modify the register PFVTCTL after enabling double VLAN to select pool by MAC but not MAC or E-tag. Signed-off-by: Wenzhuo Lu --- drivers/net/ixgbe/ixgbe_ethdev.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c index 4c4c6df..83df0c0 100644 --- a/drivers/net/ixgbe/ixgbe_ethdev.c +++ b/drivers/net/ixgbe/ixgbe_ethdev.c @@ -138,6 +138,8 @@ #define IXGBE_CYCLECOUNTER_MASK 0xULL +#define IXGBE_VT_CTL_POOLING_MODE_MASK 0x0003 + static int eth_ixgbe_dev_init(struct rte_eth_dev *eth_dev); static int eth_ixgbe_dev_uninit(struct rte_eth_dev *eth_dev); static int ixgbe_dev_configure(struct rte_eth_dev *dev); @@ -1725,6 +1727,14 @@ ixgbe_vlan_hw_extend_enable(struct rte_eth_dev *dev) ctrl |= IXGBE_EXTENDED_VLAN; IXGBE_WRITE_REG(hw, IXGBE_CTRL_EXT, ctrl); + /* Clear pooling mode of PFVTCTL. It's required by X550. */ + if (hw->mac.type == ixgbe_mac_X550 || + hw->mac.type == ixgbe_mac_X550EM_x) { + ctrl = IXGBE_READ_REG(hw, IXGBE_VT_CTL); + ctrl &= ~IXGBE_VT_CTL_POOLING_MODE_MASK; + IXGBE_WRITE_REG(hw, IXGBE_VT_CTL, ctrl); + } + /* * VET EXT field in the EXVET register = 0x8100 by default * So no need to change. Same to VT field of DMATXCTL register -- 1.9.3
[dpdk-dev] [PATCH 2/8] lib/librte_ether: support l2 tunnel config
Add functions to support l2 tunnel configuration. The support includes ether type modification and the tunnel support enabling/disabling. Ether type modification means modifying the ether type of a specific type of tunnel. So the packet with this ether type will be parsed as this type of tunnel. Enabling/disabling a tunnel support means enabling/disabling the ability of parsing the specific type of tunnel. This ability should be enabled before we enable filtering, forwarding, offloading for this specific type of tunnel. Only support e-tag tunnel now. Signed-off-by: Wenzhuo Lu --- lib/librte_ether/rte_eth_ctrl.h | 9 + lib/librte_ether/rte_ethdev.c | 61 ++ lib/librte_ether/rte_ethdev.h | 84 + 3 files changed, 154 insertions(+) diff --git a/lib/librte_ether/rte_eth_ctrl.h b/lib/librte_ether/rte_eth_ctrl.h index ce224ad..09af6fb 100644 --- a/lib/librte_ether/rte_eth_ctrl.h +++ b/lib/librte_ether/rte_eth_ctrl.h @@ -804,6 +804,15 @@ struct rte_eth_hash_filter_info { } info; }; +/** + * l2 tunnel type. + */ +enum rte_eth_l2_tunnel_type { + RTE_L2_TUNNEL_TYPE_NONE = 0, + RTE_L2_TUNNEL_TYPE_E_TAG, + RTE_L2_TUNNEL_TYPE_MAX, +}; + #ifdef __cplusplus } #endif diff --git a/lib/librte_ether/rte_ethdev.c b/lib/librte_ether/rte_ethdev.c index ed971b4..1b90e09 100644 --- a/lib/librte_ether/rte_ethdev.c +++ b/lib/librte_ether/rte_ethdev.c @@ -3239,3 +3239,64 @@ rte_eth_copy_pci_info(struct rte_eth_dev *eth_dev, struct rte_pci_device *pci_de eth_dev->data->numa_node = pci_dev->numa_node; eth_dev->data->drv_name = pci_dev->driver->name; } + +int +rte_eth_dev_l2_tunnel_eth_type_conf(uint8_t port_id, + struct rte_eth_l2_tunnel *l2_tunnel) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + if (l2_tunnel == NULL) { + RTE_PMD_DEBUG_TRACE("Invalid l2_tunnel parameter\n"); + return -EINVAL; + } + + if (l2_tunnel->l2_tunnel_type >= RTE_L2_TUNNEL_TYPE_MAX) { + RTE_PMD_DEBUG_TRACE("Invalid l2 tunnel type\n"); + return -EINVAL; + } + + dev = &rte_eth_devices[port_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->l2_tunnel_eth_type_conf, + -ENOTSUP); + return (*dev->dev_ops->l2_tunnel_eth_type_conf)(dev, l2_tunnel); +} + +int +rte_eth_dev_l2_tunnel_enable(uint8_t port_id, +enum rte_eth_l2_tunnel_type l2_tunnel_type) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + + if (l2_tunnel_type >= RTE_L2_TUNNEL_TYPE_MAX) { + RTE_PMD_DEBUG_TRACE("Invalid l2 tunnel type\n"); + return -EINVAL; + } + + dev = &rte_eth_devices[port_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->l2_tunnel_enable, + -ENOTSUP); + return (*dev->dev_ops->l2_tunnel_enable)(dev, l2_tunnel_type); +} + +int +rte_eth_dev_l2_tunnel_disable(uint8_t port_id, + enum rte_eth_l2_tunnel_type l2_tunnel_type) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + + if (l2_tunnel_type >= RTE_L2_TUNNEL_TYPE_MAX) { + RTE_PMD_DEBUG_TRACE("Invalid l2 tunnel type\n"); + return -EINVAL; + } + + dev = &rte_eth_devices[port_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->l2_tunnel_disable, + -ENOTSUP); + return (*dev->dev_ops->l2_tunnel_disable)(dev, l2_tunnel_type); +} diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h index bada8ad..9b594b8 100644 --- a/lib/librte_ether/rte_ethdev.h +++ b/lib/librte_ether/rte_ethdev.h @@ -957,6 +957,14 @@ TAILQ_HEAD(rte_eth_dev_cb_list, rte_eth_dev_callback); } \ } while (0) +/** + * l2 tunnel configuration. 
+ */ +struct rte_eth_l2_tunnel { + enum rte_eth_l2_tunnel_type l2_tunnel_type; + uint16_t ether_type; +}; + /* * Definitions of all functions exported by an Ethernet driver through the * the generic structure of type *eth_dev_ops* supplied in the *rte_eth_dev* @@ -1261,6 +1269,20 @@ typedef int (*eth_set_eeprom_t)(struct rte_eth_dev *dev, struct rte_dev_eeprom_info *info); /**< @internal Program eeprom data */ +typedef int (*eth_l2_tunnel_eth_type_conf_t) + (struct rte_eth_dev *dev, struct rte_eth_l2_tunnel *l2_tunnel); +/**< @internal config l2 tunnel ether type */ + +typedef int (*eth_l2_tunnel_enable_t) + (struct rte_eth_dev *dev, +enum rte_eth_l2_tunnel_type l2_tunnel_type); +/**< @internal enable a type of l2 tunnel */ + +typedef int (*eth_l2_tunnel_disable_t) + (struct rte_eth_dev *dev, +enum rte_eth_l2_tunnel_type l2_tunnel_type); +/**< @internal disable a type of l2 tunnel */ + #ifd
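To make the intended call flow concrete, here is a short application-side sketch using the new API; the port id and the 0x893f ether type (the default E-tag ethertype used later in this series) are illustrative values, not part of the patch.

#include <rte_ethdev.h>

static int
setup_e_tag(uint8_t port_id)
{
	struct rte_eth_l2_tunnel conf = {
		.l2_tunnel_type = RTE_L2_TUNNEL_TYPE_E_TAG,
		.ether_type = 0x893f,	/* illustrative E-tag ethertype */
	};
	int ret;

	/* Tell the port which ethertype identifies E-tag frames... */
	ret = rte_eth_dev_l2_tunnel_eth_type_conf(port_id, &conf);
	if (ret < 0)
		return ret;

	/* ...then enable parsing of this tunnel type, which is a precondition
	 * for the filtering/forwarding/offloading added later in the series. */
	return rte_eth_dev_l2_tunnel_enable(port_id, RTE_L2_TUNNEL_TYPE_E_TAG);
}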
[dpdk-dev] [PATCH 3/8] ixgbe: support l2 tunnel config
Add support of l2 tunnel configuration. Support modifying ether type of a type of l2 tunnel. Support enabling and disabling the support of a type of l2 tunnel. Only E-tag tunnel is supported now. Signed-off-by: Wenzhuo Lu --- drivers/net/ixgbe/ixgbe_ethdev.c | 140 +++ 1 file changed, 140 insertions(+) diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c index 83df0c0..55ab474 100644 --- a/drivers/net/ixgbe/ixgbe_ethdev.c +++ b/drivers/net/ixgbe/ixgbe_ethdev.c @@ -139,6 +139,10 @@ #define IXGBE_CYCLECOUNTER_MASK 0xULL #define IXGBE_VT_CTL_POOLING_MODE_MASK 0x0003 +#define DEFAULT_ETAG_ETYPE 0x893f +#define IXGBE_ETAG_ETYPE 0x5084 +#define IXGBE_ETAG_ETYPE_MASK 0x +#define IXGBE_ETAG_ETYPE_VALID 0x8000 static int eth_ixgbe_dev_init(struct rte_eth_dev *eth_dev); static int eth_ixgbe_dev_uninit(struct rte_eth_dev *eth_dev); @@ -339,6 +343,14 @@ static int ixgbe_timesync_read_time(struct rte_eth_dev *dev, struct timespec *timestamp); static int ixgbe_timesync_write_time(struct rte_eth_dev *dev, const struct timespec *timestamp); +static int ixgbe_dev_l2_tunnel_eth_type_conf + (struct rte_eth_dev *dev, struct rte_eth_l2_tunnel *l2_tunnel); +static int ixgbe_dev_l2_tunnel_enable + (struct rte_eth_dev *dev, +enum rte_eth_l2_tunnel_type l2_tunnel_type); +static int ixgbe_dev_l2_tunnel_disable + (struct rte_eth_dev *dev, +enum rte_eth_l2_tunnel_type l2_tunnel_type); /* * Define VF Stats MACRO for Non "cleared on read" register @@ -497,6 +509,9 @@ static const struct eth_dev_ops ixgbe_eth_dev_ops = { .timesync_adjust_time = ixgbe_timesync_adjust_time, .timesync_read_time = ixgbe_timesync_read_time, .timesync_write_time = ixgbe_timesync_write_time, + .l2_tunnel_eth_type_conf = ixgbe_dev_l2_tunnel_eth_type_conf, + .l2_tunnel_enable= ixgbe_dev_l2_tunnel_enable, + .l2_tunnel_disable = ixgbe_dev_l2_tunnel_disable, }; /* @@ -6201,6 +6216,131 @@ ixgbe_dev_get_dcb_info(struct rte_eth_dev *dev, return 0; } +/* Update e-tag ether type */ +static int +ixgbe_update_e_tag_eth_type(struct ixgbe_hw *hw, + uint16_t ether_type) +{ + uint32_t etag_etype; + + if (hw->mac.type != ixgbe_mac_X550 && + hw->mac.type != ixgbe_mac_X550EM_x) { + return -ENOTSUP; + } + + etag_etype = IXGBE_READ_REG(hw, IXGBE_ETAG_ETYPE); + etag_etype &= ~IXGBE_ETAG_ETYPE_MASK; + etag_etype |= ether_type; + IXGBE_WRITE_REG(hw, IXGBE_ETAG_ETYPE, etag_etype); + IXGBE_WRITE_FLUSH(hw); + + return 0; +} + +/* Config l2 tunnel ether type */ +static int +ixgbe_dev_l2_tunnel_eth_type_conf(struct rte_eth_dev *dev, + struct rte_eth_l2_tunnel *l2_tunnel) +{ + int ret = 0; + struct ixgbe_hw *hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private); + + if (l2_tunnel == NULL) + return -EINVAL; + + switch (l2_tunnel->l2_tunnel_type) { + case RTE_L2_TUNNEL_TYPE_E_TAG: + ret = ixgbe_update_e_tag_eth_type(hw, l2_tunnel->ether_type); + break; + default: + PMD_DRV_LOG(ERR, "Invalid tunnel type"); + ret = -1; + break; + } + + return ret; +} + +/* Enable e-tag tunnel */ +static int +ixgbe_e_tag_enable(struct ixgbe_hw *hw) +{ + uint32_t etag_etype; + + if (hw->mac.type != ixgbe_mac_X550 && + hw->mac.type != ixgbe_mac_X550EM_x) { + return -ENOTSUP; + } + + etag_etype = IXGBE_READ_REG(hw, IXGBE_ETAG_ETYPE); + etag_etype |= IXGBE_ETAG_ETYPE_VALID; + IXGBE_WRITE_REG(hw, IXGBE_ETAG_ETYPE, etag_etype); + IXGBE_WRITE_FLUSH(hw); + + return 0; +} + +/* Enable l2 tunnel */ +static int +ixgbe_dev_l2_tunnel_enable(struct rte_eth_dev *dev, + enum rte_eth_l2_tunnel_type l2_tunnel_type) +{ + int ret = 0; + struct ixgbe_hw *hw = 
IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private); + + switch (l2_tunnel_type) { + case RTE_L2_TUNNEL_TYPE_E_TAG: + ret = ixgbe_e_tag_enable(hw); + break; + default: + PMD_DRV_LOG(ERR, "Invalid tunnel type"); + ret = -1; + break; + } + + return ret; +} + +/* Disable e-tag tunnel */ +static int +ixgbe_e_tag_disable(struct ixgbe_hw *hw) +{ + uint32_t etag_etype; + + if (hw->mac.type != ixgbe_mac_X550 && + hw->mac.type != ixgbe_mac_X550EM_x) { + return -ENOTSUP; + } + + etag_etype = IXGBE_READ_REG(hw, IXGBE_ETAG_ETYPE); + etag_etype &= ~IXGBE_ETAG_ETYPE_VALID; + IXGBE_WRITE_REG(hw, IXGB
[dpdk-dev] [PATCH 4/8] app/testpmd: add CLIs for l2 tunnel config
Add CLIs to config ether type of l2 tunnel, and to enable/disable a type of l2 tunnel. Now only e-tag tunnel is supported. Signed-off-by: Wenzhuo Lu --- app/test-pmd/cmdline.c | 259 + 1 file changed, 259 insertions(+) diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c index 73298c9..d409934 100644 --- a/app/test-pmd/cmdline.c +++ b/app/test-pmd/cmdline.c @@ -9630,6 +9630,261 @@ cmdline_parse_inst_t cmd_mcast_addr = { }, }; +/* l2 tunnel config + * only support E-tag now. + */ + +/* Ether type config */ +struct cmd_config_l2_tunnel_eth_type_result { + cmdline_fixed_string_t port; + cmdline_fixed_string_t config; + cmdline_fixed_string_t all; + uint8_t id; + cmdline_fixed_string_t l2_tunnel; + cmdline_fixed_string_t l2_tunnel_type; + cmdline_fixed_string_t eth_type; + uint16_t eth_type_val; +}; + +cmdline_parse_token_string_t cmd_config_l2_tunnel_eth_type_port = + TOKEN_STRING_INITIALIZER + (struct cmd_config_l2_tunnel_eth_type_result, +port, "port"); +cmdline_parse_token_string_t cmd_config_l2_tunnel_eth_type_config = + TOKEN_STRING_INITIALIZER + (struct cmd_config_l2_tunnel_eth_type_result, +config, "config"); +cmdline_parse_token_string_t cmd_config_l2_tunnel_eth_type_all_str = + TOKEN_STRING_INITIALIZER + (struct cmd_config_l2_tunnel_eth_type_result, +all, "all"); +cmdline_parse_token_num_t cmd_config_l2_tunnel_eth_type_id = + TOKEN_NUM_INITIALIZER + (struct cmd_config_l2_tunnel_eth_type_result, +id, UINT8); +cmdline_parse_token_string_t cmd_config_l2_tunnel_eth_type_l2_tunnel = + TOKEN_STRING_INITIALIZER + (struct cmd_config_l2_tunnel_eth_type_result, +l2_tunnel, "l2-tunnel"); +cmdline_parse_token_string_t cmd_config_l2_tunnel_eth_type_l2_tunnel_type = + TOKEN_STRING_INITIALIZER + (struct cmd_config_l2_tunnel_eth_type_result, +l2_tunnel_type, "E-tag"); +cmdline_parse_token_string_t cmd_config_l2_tunnel_eth_type_eth_type = + TOKEN_STRING_INITIALIZER + (struct cmd_config_l2_tunnel_eth_type_result, +eth_type, "ether-type"); +cmdline_parse_token_num_t cmd_config_l2_tunnel_eth_type_eth_type_val = + TOKEN_NUM_INITIALIZER + (struct cmd_config_l2_tunnel_eth_type_result, +eth_type_val, UINT16); + +static uint32_t +str2fdir_l2_tunnel_type(char *string) +{ + uint32_t i = 0; + + static const struct { + char str[32]; + uint32_t type; + } l2_tunnel_type_str[] = { + {"E-tag", RTE_L2_TUNNEL_TYPE_E_TAG}, + }; + + for (i = 0; i < RTE_DIM(l2_tunnel_type_str); i++) { + if (!strcmp(l2_tunnel_type_str[i].str, string)) + return l2_tunnel_type_str[i].type; + } + return RTE_L2_TUNNEL_TYPE_NONE; +} + +/* ether type config for all ports */ +static void +cmd_config_l2_tunnel_eth_type_all_parsed + (void *parsed_result, +__attribute__((unused)) struct cmdline *cl, +__attribute__((unused)) void *data) +{ + struct cmd_config_l2_tunnel_eth_type_result *res = parsed_result; + struct rte_eth_l2_tunnel entry; + portid_t pid; + + entry.l2_tunnel_type = str2fdir_l2_tunnel_type(res->l2_tunnel_type); + entry.ether_type = res->eth_type_val; + + FOREACH_PORT(pid, ports) { + rte_eth_dev_l2_tunnel_eth_type_conf(pid, &entry); + } +} + +cmdline_parse_inst_t cmd_config_l2_tunnel_eth_type_all = { + .f = cmd_config_l2_tunnel_eth_type_all_parsed, + .data = NULL, + .help_str = "port config all l2-tunnel ether-type", + .tokens = { + (void *)&cmd_config_l2_tunnel_eth_type_port, + (void *)&cmd_config_l2_tunnel_eth_type_config, + (void *)&cmd_config_l2_tunnel_eth_type_all_str, + (void *)&cmd_config_l2_tunnel_eth_type_l2_tunnel, + (void *)&cmd_config_l2_tunnel_eth_type_l2_tunnel_type, + (void 
*)&cmd_config_l2_tunnel_eth_type_eth_type, + (void *)&cmd_config_l2_tunnel_eth_type_eth_type_val, + NULL, + }, +}; + +/* ether type config for a specific port */ +static void +cmd_config_l2_tunnel_eth_type_specific_parsed( + void *parsed_result, + __attribute__((unused)) struct cmdline *cl, + __attribute__((unused)) void *data) +{ + struct cmd_config_l2_tunnel_eth_type_result *res = +parsed_result; + struct rte_eth_l2_tunnel entry; + + if (port_id_is_invalid(res->id, ENABLED_WARN)) + return; + + entry.l2_tunnel_type = str2fdir_l2_tunnel_type(res->l2_tunnel_type); + entry.ether_type = res->eth_type_val; + + rte_eth_dev_l2_tunnel_eth_type_conf(res->id, &e
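Based on the token definitions above, the resulting testpmd commands should look roughly as follows; the ether-type value is illustrative, and the per-port form is inferred from the "specific" handler, whose token list is cut off in this archive.

testpmd> port config all l2-tunnel E-tag ether-type 0x893f
testpmd> port config 0 l2-tunnel E-tag ether-type 0x893f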
[dpdk-dev] [PATCH 7/8] app/testpmd: add CLIs for E-tag operation
Add the CLIs to support the E-tag operation. 1, Offloading of E-tag insertion and stripping. 2, Forwarding the E-tag packets to pools based on the GRP and E-CID_base. Signed-off-by: Wenzhuo Lu --- app/test-pmd/cmdline.c | 340 + 1 file changed, 340 insertions(+) diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c index d409934..df07e60 100644 --- a/app/test-pmd/cmdline.c +++ b/app/test-pmd/cmdline.c @@ -9885,6 +9885,340 @@ cmdline_parse_inst_t cmd_config_l2_tunnel_en_dis_specific = { }, }; +/* E-tag configuration */ + +/* Common result structure for all E-tag configuration */ +struct cmd_config_e_tag_result { + cmdline_fixed_string_t e_tag; + cmdline_fixed_string_t set; + cmdline_fixed_string_t insertion; + cmdline_fixed_string_t stripping; + cmdline_fixed_string_t forwarding; + cmdline_fixed_string_t filter; + cmdline_fixed_string_t add; + cmdline_fixed_string_t del; + cmdline_fixed_string_t on; + cmdline_fixed_string_t off; + cmdline_fixed_string_t on_off; + cmdline_fixed_string_t port_tag_id; + uint32_t port_tag_id_val; + cmdline_fixed_string_t e_tag_id; + uint16_t e_tag_id_val; + cmdline_fixed_string_t dst_pool; + uint8_t dst_pool_val; + cmdline_fixed_string_t port; + uint8_t port_id; +}; + +/* Common CLI fields for all E-tag configuration */ +cmdline_parse_token_string_t cmd_config_e_tag_e_tag = + TOKEN_STRING_INITIALIZER + (struct cmd_config_e_tag_result, +e_tag, "E-tag"); +cmdline_parse_token_string_t cmd_config_e_tag_set = + TOKEN_STRING_INITIALIZER + (struct cmd_config_e_tag_result, +set, "set"); +cmdline_parse_token_string_t cmd_config_e_tag_insertion = + TOKEN_STRING_INITIALIZER + (struct cmd_config_e_tag_result, +insertion, "insertion"); +cmdline_parse_token_string_t cmd_config_e_tag_stripping = + TOKEN_STRING_INITIALIZER + (struct cmd_config_e_tag_result, +stripping, "stripping"); +cmdline_parse_token_string_t cmd_config_e_tag_forwarding = + TOKEN_STRING_INITIALIZER + (struct cmd_config_e_tag_result, +forwarding, "forwarding"); +cmdline_parse_token_string_t cmd_config_e_tag_filter = + TOKEN_STRING_INITIALIZER + (struct cmd_config_e_tag_result, +filter, "filter"); +cmdline_parse_token_string_t cmd_config_e_tag_add = + TOKEN_STRING_INITIALIZER + (struct cmd_config_e_tag_result, +add, "add"); +cmdline_parse_token_string_t cmd_config_e_tag_del = + TOKEN_STRING_INITIALIZER + (struct cmd_config_e_tag_result, +del, "del"); +cmdline_parse_token_string_t cmd_config_e_tag_on = + TOKEN_STRING_INITIALIZER + (struct cmd_config_e_tag_result, +on, "on"); +cmdline_parse_token_string_t cmd_config_e_tag_off = + TOKEN_STRING_INITIALIZER + (struct cmd_config_e_tag_result, +off, "off"); +cmdline_parse_token_string_t cmd_config_e_tag_on_off = + TOKEN_STRING_INITIALIZER + (struct cmd_config_e_tag_result, +on_off, "on#off"); +cmdline_parse_token_string_t cmd_config_e_tag_port_tag_id = + TOKEN_STRING_INITIALIZER + (struct cmd_config_e_tag_result, +port_tag_id, "port-tag-id"); +cmdline_parse_token_num_t cmd_config_e_tag_port_tag_id_val = + TOKEN_NUM_INITIALIZER + (struct cmd_config_e_tag_result, +port_tag_id_val, UINT32); +cmdline_parse_token_string_t cmd_config_e_tag_e_tag_id = + TOKEN_STRING_INITIALIZER + (struct cmd_config_e_tag_result, +e_tag_id, "e-tag-id"); +cmdline_parse_token_num_t cmd_config_e_tag_e_tag_id_val = + TOKEN_NUM_INITIALIZER + (struct cmd_config_e_tag_result, +e_tag_id_val, UINT16); +cmdline_parse_token_string_t cmd_config_e_tag_dst_pool = + TOKEN_STRING_INITIALIZER + (struct cmd_config_e_tag_result, +dst_pool, "dst-pool"); +cmdline_parse_token_num_t 
cmd_config_e_tag_dst_pool_val = + TOKEN_NUM_INITIALIZER + (struct cmd_config_e_tag_result, +dst_pool_val, UINT8); +cmdline_parse_token_string_t cmd_config_e_tag_port = + TOKEN_STRING_INITIALIZER + (struct cmd_config_e_tag_result, +port, "port"); +cmdline_parse_token_num_t cmd_config_e_tag_port_id = + TOKEN_NUM_INITIALIZER + (struct cmd_config_e_tag_result, +port_id, UINT8); + +/* E-tag insertion configuration */ +static void +cmd_config_e_tag_insertion_en_parsed( + void *parsed_result, + __attribute__((unused)) struct cmdline *cl, + __attribute__((unused)) void *data) +{ + str
[dpdk-dev] [PATCH 8/8] doc: add release note for E-tag
Signed-off-by: Wenzhuo Lu --- doc/guides/rel_notes/release_2_3.rst | 6 ++ 1 file changed, 6 insertions(+) diff --git a/doc/guides/rel_notes/release_2_3.rst b/doc/guides/rel_notes/release_2_3.rst index 99de186..16bd80b 100644 --- a/doc/guides/rel_notes/release_2_3.rst +++ b/doc/guides/rel_notes/release_2_3.rst @@ -4,6 +4,12 @@ DPDK Release 2.3 New Features +* **Added support for E-tag on X550.** + + * Support E-tag offloading of insertion and stripping. + * Support Forwarding E-tag packets to pools based on +GRP and E-CID_base. + Resolved Issues --- -- 1.9.3
[dpdk-dev] [PATCH 5/8] lib/librte_ether: support new l2 tunnel operation
Add functions to support the new l2 tunnel operation. 1, Insertion and stripping for l2 tunnel tag. 2, Forwarding the packets to a pool based on l2 tunnel tag. Signed-off-by: Wenzhuo Lu --- lib/librte_ether/rte_ethdev.c | 178 lib/librte_ether/rte_ethdev.h | 204 ++ 2 files changed, 382 insertions(+) diff --git a/lib/librte_ether/rte_ethdev.c b/lib/librte_ether/rte_ethdev.c index 1b90e09..a5a398c 100644 --- a/lib/librte_ether/rte_ethdev.c +++ b/lib/librte_ether/rte_ethdev.c @@ -3300,3 +3300,181 @@ rte_eth_dev_l2_tunnel_disable(uint8_t port_id, -ENOTSUP); return (*dev->dev_ops->l2_tunnel_disable)(dev, l2_tunnel_type); } + +int +rte_eth_dev_l2_tunnel_insertion_enable(uint8_t port_id, + struct rte_eth_l2_tunnel *l2_tunnel) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + if (l2_tunnel == NULL) { + RTE_PMD_DEBUG_TRACE("Invalid l2_tunnel parameter\n"); + return -EINVAL; + } + + if (l2_tunnel->l2_tunnel_type >= RTE_L2_TUNNEL_TYPE_MAX) { + RTE_PMD_DEBUG_TRACE("Invalid l2 tunnel type\n"); + return -EINVAL; + } + + dev = &rte_eth_devices[port_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->l2_tunnel_insertion_enable, + -ENOTSUP); + return (*dev->dev_ops->l2_tunnel_insertion_enable)(dev, l2_tunnel); +} + +int +rte_eth_dev_l2_tunnel_insertion_disable(uint8_t port_id, + enum rte_eth_l2_tunnel_type + l2_tunnel_type) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + + if (l2_tunnel_type >= RTE_L2_TUNNEL_TYPE_MAX) { + RTE_PMD_DEBUG_TRACE("Invalid l2 tunnel type\n"); + return -EINVAL; + } + + dev = &rte_eth_devices[port_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->l2_tunnel_insertion_disable, + -ENOTSUP); + return (*dev->dev_ops->l2_tunnel_insertion_disable)(dev, + l2_tunnel_type); +} + +int +rte_eth_dev_l2_tunnel_stripping_enable(uint8_t port_id, + enum rte_eth_l2_tunnel_type + l2_tunnel_type) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + + if (l2_tunnel_type >= RTE_L2_TUNNEL_TYPE_MAX) { + RTE_PMD_DEBUG_TRACE("Invalid l2 tunnel type\n"); + return -EINVAL; + } + + dev = &rte_eth_devices[port_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->l2_tunnel_stripping_enable, + -ENOTSUP); + return (*dev->dev_ops->l2_tunnel_stripping_enable)(dev, + l2_tunnel_type); +} + +int +rte_eth_dev_l2_tunnel_stripping_disable(uint8_t port_id, +enum rte_eth_l2_tunnel_type +l2_tunnel_type) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + + if (l2_tunnel_type >= RTE_L2_TUNNEL_TYPE_MAX) { + RTE_PMD_DEBUG_TRACE("Invalid l2 tunnel type\n"); + return -EINVAL; + } + + dev = &rte_eth_devices[port_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->l2_tunnel_stripping_disable, + -ENOTSUP); + return (*dev->dev_ops->l2_tunnel_stripping_disable)(dev, + l2_tunnel_type); +} + +int +rte_eth_dev_l2_tunnel_forwarding_enable(uint8_t port_id, +enum rte_eth_l2_tunnel_type +l2_tunnel_type) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + + if (l2_tunnel_type >= RTE_L2_TUNNEL_TYPE_MAX) { + RTE_PMD_DEBUG_TRACE("Invalid l2 tunnel type\n"); + return -EINVAL; + } + + dev = &rte_eth_devices[port_id]; + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->l2_tunnel_forwarding_enable, + -ENOTSUP); + return (*dev->dev_ops->l2_tunnel_forwarding_enable)(dev, + l2_tunnel_type); +} + +int +rte_eth_dev_l2_tunnel_forwarding_disable(uint8_t port_id, +enum rte_eth_l2_tunnel_type +l2_tunnel_type) +{ + struct rte_eth_dev *dev; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + + if (l2_tunnel_type 
>= RTE_L2_TUNNEL_TYPE_MAX) { + RTE_PMD_DEBUG_TRACE
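As with the configuration API in patch 2/8, a rough application-side sketch of the new calls; the port id is illustrative, and the insertion-enable call takes the same struct rte_eth_l2_tunnel used by the other l2 tunnel APIs.

#include <rte_ethdev.h>

/* Sketch: enable E-tag stripping and forwarding on a port, then enable
 * insertion for the traffic described by 'tunnel'. */
static int
enable_e_tag_datapath(uint8_t port_id, struct rte_eth_l2_tunnel *tunnel)
{
	int ret;

	ret = rte_eth_dev_l2_tunnel_stripping_enable(port_id,
						     RTE_L2_TUNNEL_TYPE_E_TAG);
	if (ret < 0)
		return ret;

	ret = rte_eth_dev_l2_tunnel_forwarding_enable(port_id,
						      RTE_L2_TUNNEL_TYPE_E_TAG);
	if (ret < 0)
		return ret;

	return rte_eth_dev_l2_tunnel_insertion_enable(port_id, tunnel);
}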
[dpdk-dev] [PATCH 6/8] ixgbe: support l2 tunnel operation
Add support of l2 tunnel operation. Support enabling/disabling l2 tunnel tag insertion/stripping. Support enabling/disabling l2 tunnel packets forwarding. Support adding/deleting forwarding rules for l2 tunnel packets. Only support E-tag now. Signed-off-by: Wenzhuo Lu --- drivers/net/ixgbe/ixgbe_ethdev.c | 357 +++ 1 file changed, 357 insertions(+) diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c index 55ab474..35e76a2 100644 --- a/drivers/net/ixgbe/ixgbe_ethdev.c +++ b/drivers/net/ixgbe/ixgbe_ethdev.c @@ -139,10 +139,17 @@ #define IXGBE_CYCLECOUNTER_MASK 0xULL #define IXGBE_VT_CTL_POOLING_MODE_MASK 0x0003 +#define IXGBE_VT_CTL_POOLING_MODE_ETAG 0x0001 #define DEFAULT_ETAG_ETYPE 0x893f #define IXGBE_ETAG_ETYPE 0x5084 #define IXGBE_ETAG_ETYPE_MASK 0x #define IXGBE_ETAG_ETYPE_VALID 0x8000 +#define IXGBE_RAH_ADTYPE 0x4000 +#define IXGBE_RAL_ETAG_FILTER_MASK 0x3fff +#define IXGBE_VMVIR_TAGA_MASK 0x1800 +#define IXGBE_VMVIR_TAGA_ETAG_INSERT 0x0800 +#define IXGBE_VMTIR(_i) (0x00017000 + ((_i) * 4)) /* 64 of these (0-63) */ +#define IXGBE_QDE_STRIP_TAG0x0004 static int eth_ixgbe_dev_init(struct rte_eth_dev *eth_dev); static int eth_ixgbe_dev_uninit(struct rte_eth_dev *eth_dev); @@ -351,6 +358,30 @@ static int ixgbe_dev_l2_tunnel_enable static int ixgbe_dev_l2_tunnel_disable (struct rte_eth_dev *dev, enum rte_eth_l2_tunnel_type l2_tunnel_type); +static int ixgbe_dev_l2_tunnel_insertion_enable + (struct rte_eth_dev *dev, struct rte_eth_l2_tunnel *l2_tunnel); +static int ixgbe_dev_l2_tunnel_insertion_disable + (struct rte_eth_dev *dev, +enum rte_eth_l2_tunnel_type l2_tunnel_type); +static int ixgbe_dev_l2_tunnel_stripping_enable + (struct rte_eth_dev *dev, +enum rte_eth_l2_tunnel_type l2_tunnel_type); +static int ixgbe_dev_l2_tunnel_stripping_disable + (struct rte_eth_dev *dev, +enum rte_eth_l2_tunnel_type l2_tunnel_type); +static int ixgbe_dev_l2_tunnel_forwarding_enable + (struct rte_eth_dev *dev, +enum rte_eth_l2_tunnel_type l2_tunnel_type); +static int ixgbe_dev_l2_tunnel_forwarding_disable + (struct rte_eth_dev *dev, +enum rte_eth_l2_tunnel_type l2_tunnel_type); +static int ixgbe_dev_l2_tunnel_filter_add + (struct rte_eth_dev *dev, +struct rte_eth_l2_tunnel *l2_tunnel, +uint32_t pool); +static int ixgbe_dev_l2_tunnel_filter_del + (struct rte_eth_dev *dev, +struct rte_eth_l2_tunnel *l2_tunnel); /* * Define VF Stats MACRO for Non "cleared on read" register @@ -512,6 +543,14 @@ static const struct eth_dev_ops ixgbe_eth_dev_ops = { .l2_tunnel_eth_type_conf = ixgbe_dev_l2_tunnel_eth_type_conf, .l2_tunnel_enable= ixgbe_dev_l2_tunnel_enable, .l2_tunnel_disable = ixgbe_dev_l2_tunnel_disable, + .l2_tunnel_insertion_enable = ixgbe_dev_l2_tunnel_insertion_enable, + .l2_tunnel_insertion_disable = ixgbe_dev_l2_tunnel_insertion_disable, + .l2_tunnel_stripping_enable = ixgbe_dev_l2_tunnel_stripping_enable, + .l2_tunnel_stripping_disable = ixgbe_dev_l2_tunnel_stripping_disable, + .l2_tunnel_forwarding_enable = ixgbe_dev_l2_tunnel_forwarding_enable, + .l2_tunnel_forwarding_disable = ixgbe_dev_l2_tunnel_forwarding_disable, + .l2_tunnel_filter_add = ixgbe_dev_l2_tunnel_filter_add, + .l2_tunnel_filter_del = ixgbe_dev_l2_tunnel_filter_del, }; /* @@ -6341,6 +6380,324 @@ ixgbe_dev_l2_tunnel_disable(struct rte_eth_dev *dev, return ret; } +static int +ixgbe_e_tag_filter_add(struct rte_eth_dev *dev, + struct rte_eth_l2_tunnel *l2_tunnel, + uint32_t pool) +{ + int ret = 0; + struct ixgbe_hw *hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private); + u32 i, rar_entries; + u32 rar_low, rar_high; 
+ + if (hw->mac.type != ixgbe_mac_X550 && + hw->mac.type != ixgbe_mac_X550EM_x) { + return -ENOTSUP; + } + + rar_entries = ixgbe_get_num_rx_addrs(hw); + + for (i = 1; i < rar_entries; i++) { + rar_high = IXGBE_READ_REG(hw, IXGBE_RAH(i)); + if (rar_high & IXGBE_RAH_AV) { + continue; + } else { + ixgbe_set_vmdq(hw, i, pool); + rar_high = IXGBE_RAH_AV | IXGBE_RAH_ADTYPE; + rar_low = l2_tunnel->tunnel_id; + + IXGBE_WRITE_REG(hw, IXGBE_RAL(i), rar_low); + IXGBE_WRITE_REG(hw, IXGBE_RAH(i), rar_high); + + return ret; + } + } + + PMD_INIT_LOG(NOTICE, "The table of E-ta
[dpdk-dev] [PATCH v5 06/11] virtio: vfio: add api support to rd/wr ioport bar
On Tue, Jan 19, 2016 at 05:16:07PM +0530, Santosh Shukla wrote: > For vfio case - Use pread/pwrite api to access virtio > ioport space. > > Applicable for virtio 0.95 spec. > > Signed-off-by: Santosh Shukla > Signed-off-by: Rizwan Ansari > Signed-off-by: Rakesh Krishnamurthy > --- > v4 --> v5: > - Removed unnecessary type casting. > > drivers/net/virtio/virtio_vfio_rw.h | 104 > +++ > 1 file changed, 104 insertions(+) > create mode 100644 drivers/net/virtio/virtio_vfio_rw.h > > diff --git a/drivers/net/virtio/virtio_vfio_rw.h > b/drivers/net/virtio/virtio_vfio_rw.h > new file mode 100644 > index 000..a9ab04d > --- /dev/null > +++ b/drivers/net/virtio/virtio_vfio_rw.h Maybe virtio_vfio.h is enough, or if you have to put a sufix there, _io (instead of _rw) is a better option to me. > + > +/* vfio rd/rw virtio apis */ > +static inline void > +ioport_inb(const struct rte_pci_device *pci_dev, uint8_t reg, uint8_t *val) vfio_inb is a better name; ioport_inb is too generic. --yliu
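For illustration, a minimal sketch of what the suggested vfio_inb() could look like; the per-device structure, its field names and the origin of bar_offset (VFIO_DEVICE_GET_REGION_INFO) are assumptions for this sketch, and only the idea of using pread() on the VFIO device fd comes from the patch under review.

#include <stdint.h>
#include <unistd.h>

/* Hypothetical per-device state: the VFIO device fd and the file offset of
 * the legacy ioport BAR as reported by VFIO_DEVICE_GET_REGION_INFO. */
struct vfio_ioport {
	int dev_fd;
	uint64_t bar_offset;
};

static inline uint8_t
vfio_inb(const struct vfio_ioport *io, uint8_t reg)
{
	uint8_t val = 0;

	if (pread(io->dev_fd, &val, sizeof(val), io->bar_offset + reg) !=
	    sizeof(val))
		val = 0; /* read failed; a real implementation would log the error */

	return val;
}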
[dpdk-dev] [PATCH 0/8] support E-tag offloading and forwarding on Intel X550 NIC
Hi, Wenzhuo Better to explain what E-tag is, so that reviewers could known it. Thanks, Michael On 1/29/2016 3:05 PM, Wenzhuo Lu wrote: > This patch set adds the support of E-tag offloading and forwarding > on X550. > The offloading means E-tag can be inserted and stripped by HW. > And E-tag packets can be recognized and forwarded to specific pools > based on GRP and E-CID_base in E-tag. > > Wenzhuo Lu (8): > ixgbe: select pool by MAC when using double VLAN > lib/librte_ether: support l2 tunnel config > ixgbe: support l2 tunnel config > app/testpmd: add CLIs for l2 tunnel config > lib/librte_ether: support new l2 tunnel operation > ixgbe: support l2 tunnel operation > app/testpmd: add CLIs for E-tag operation > doc: add release note for E-tag > > app/test-pmd/cmdline.c | 599 > +++ > doc/guides/rel_notes/release_2_3.rst | 6 + > drivers/net/ixgbe/ixgbe_ethdev.c | 507 + > lib/librte_ether/rte_eth_ctrl.h | 9 + > lib/librte_ether/rte_ethdev.c| 239 ++ > lib/librte_ether/rte_ethdev.h| 288 + > 6 files changed, 1648 insertions(+) >
[dpdk-dev] [PATCH v5 09/11] virtio_pci: do not parse if interface is vfio-noiommu
Two minor nits. Firstly about your title, you should be consistent: sometimes you use virtio_pci, and sometimes you use virtio_pic.h. And for virtio pmd driver, "virtio: " prefix is pretty enough, no need another extra "vfio: " or "pci: " prefix. And the same to your EAL changes. EAL is a bigger, having more components, thus sometimes 2 prefixs are used. And if you are not sure how to add prefix, dig the git history to get the answer. On Tue, Jan 19, 2016 at 05:16:10PM +0530, Santosh Shukla wrote: > If virtio interface attached to vfio-noiommu driver then > do not parse for virtio resource. Instead exit with return 0; > > Note: Applicable for virtio spec 0.95. And this is not necessary: io port stuff is for virtio 0.95 only. virtio 1.0 won't use that, at all. --yliu
[dpdk-dev] [PATCH v5 06/11] virtio: vfio: add api support to rd/wr ioport bar
On Fri, Jan 29, 2016 at 12:37 PM, Yuanhan Liu wrote: > On Tue, Jan 19, 2016 at 05:16:07PM +0530, Santosh Shukla wrote: >> For vfio case - Use pread/pwrite api to access virtio >> ioport space. >> >> Applicable for virtio 0.95 spec. >> >> Signed-off-by: Santosh Shukla >> Signed-off-by: Rizwan Ansari >> Signed-off-by: Rakesh Krishnamurthy >> --- >> v4 --> v5: >> - Removed unnecessary type casting. >> >> drivers/net/virtio/virtio_vfio_rw.h | 104 >> +++ >> 1 file changed, 104 insertions(+) >> create mode 100644 drivers/net/virtio/virtio_vfio_rw.h >> >> diff --git a/drivers/net/virtio/virtio_vfio_rw.h >> b/drivers/net/virtio/virtio_vfio_rw.h >> new file mode 100644 >> index 000..a9ab04d >> --- /dev/null >> +++ b/drivers/net/virtio/virtio_vfio_rw.h > > Maybe virtio_vfio.h is enough, or if you have to put a sufix there, _io > (instead of _rw) is a better option to me. > Ok. >> + >> +/* vfio rd/rw virtio apis */ >> +static inline void >> +ioport_inb(const struct rte_pci_device *pci_dev, uint8_t reg, uint8_t *val) > > vfio_inb is a better name; ioport_inb is too generic. > Ok. > --yliu
[dpdk-dev] [PATCH v7 1/2] tools: Add support for handling built-in kernel modules
On Thu, Jan 28, 2016 at 02:13:53PM +0100, krytarowski at caviumnetworks.com wrote: > From: Kamil Rytarowski > > Currently dpdk_nic_bind.py detects Linux kernel modules via reading > /proc/modules. Built-in ones aren't listed there and therefore they are not > being found by the script. > > Add support for checking built-in modules with parsing the sysfs files. > > This commit obsoletes the /proc/modules parsing approach. > > Signed-off-by: Kamil Rytarowski Acked-by: Yuanhan Liu --yliu
[dpdk-dev] [PATCH v5 09/11] virtio_pci: do not parse if interface is vfio-noiommu
On Fri, Jan 29, 2016 at 12:47 PM, Yuanhan Liu wrote: > Two minor nits. > > Firstly about your title, you should be consistent: sometimes you > use virtio_pci, and sometimes you use virtio_pic.h. And for virtio > pmd driver, "virtio: " prefix is pretty enough, no need another > extra "vfio: " or "pci: " prefix. > > And the same to your EAL changes. EAL is a bigger, having more > components, thus sometimes 2 prefixs are used. And if you are > not sure how to add prefix, dig the git history to get the answer. > > On Tue, Jan 19, 2016 at 05:16:10PM +0530, Santosh Shukla wrote: >> If virtio interface attached to vfio-noiommu driver then >> do not parse for virtio resource. Instead exit with return 0; >> >> Note: Applicable for virtio spec 0.95. > > And this is not necessary: io port stuff is for virtio 0.95 only. > virtio 1.0 won't use that, at all. > Okay, I removed [08/11] patch from v5 series and modified this patch accordingly [1]. So, ignore this patch and pl. review provided link. [1] http://dpdk.org/dev/patchwork/patch/10143/ > --yliu
[dpdk-dev] [PATCH v4 00/14] Add virtio support for arm/arm64
On 1/14/2016 9:29 PM, Santosh Shukla wrote: > Hi, > > This v4 patch uses vfio-noiommu-way to access virtio-net pci interface. > Tested for arm64 thunderX platform. Patch builds for > x86/i386/arm/armv8/thunderX. Tested with testpmd application. Hi Shukla: I will take two weeks' leave from tomorrow and will not have enough bandwidth to review your code. I think Yuanhan will do that. Sorry for the inconvenience. Btw, when you send a new version, could you add --in-reply-to so that all your patches are within one thread?
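(Purely as an illustration of the request above, a new revision can be threaded with something like: git send-email --thread --in-reply-to='<message-id-of-the-v4-cover-letter>' v5-*.patch, where the message-id is a placeholder.)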
[dpdk-dev] [PATCH v2 1/2] i40evf: allocate virtchnl cmd buffer for each vf
> -Original Message- > From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Jingjing Wu > Sent: Wednesday, January 27, 2016 9:50 AM > To: dev at dpdk.org > Subject: [dpdk-dev] [PATCH v2 1/2] i40evf: allocate virtchnl cmd buffer for > each vf > > Currently, i40evf PMD uses a global static buffer to send virtchnl command to > host driver. It is shared by multi VFs. > This patch changed to allocate virtchnl cmd buffer for each VF. > > Signed-off-by: Jingjing Wu > --- > drivers/net/i40e/i40e_ethdev.h| 2 + > drivers/net/i40e/i40e_ethdev_vf.c | 181 +++--- > > 2 files changed, 74 insertions(+), 109 deletions(-) > > diff --git a/drivers/net/i40e/i40e_ethdev.h > b/drivers/net/i40e/i40e_ethdev.h index 1f9792b..93122ad 100644 > --- a/drivers/net/i40e/i40e_ethdev.h > +++ b/drivers/net/i40e/i40e_ethdev.h > @@ -494,7 +494,9 @@ struct i40e_vf { > bool link_up; > bool vf_reset; > volatile uint32_t pend_cmd; /* pending command not finished yet */ > + uint32_t cmd_retval; /* return value of the cmd response from PF */ > u16 pend_msg; /* flags indicates events from pf not handled yet */ > + uint8_t *aq_resp; /* buffer to store the adminq response from PF */ > > /* VSI info */ > struct i40e_virtchnl_vf_resource *vf_res; /* All VSIs */ diff --git > a/drivers/net/i40e/i40e_ethdev_vf.c b/drivers/net/i40e/i40e_ethdev_vf.c > index 14d2a50..64e6957 100644 > --- a/drivers/net/i40e/i40e_ethdev_vf.c > +++ b/drivers/net/i40e/i40e_ethdev_vf.c > @@ -103,9 +103,6 @@ enum i40evf_aq_result { > I40EVF_MSG_CMD, /* Read async command result */ > }; > > -/* A share buffer to store the command result from PF driver */ -static > uint8_t cmd_result_buffer[I40E_AQ_BUF_SZ]; > - > static int i40evf_dev_configure(struct rte_eth_dev *dev); static int > i40evf_dev_start(struct rte_eth_dev *dev); static void > i40evf_dev_stop(struct rte_eth_dev *dev); @@ -237,31 +234,39 @@ > i40evf_set_mac_type(struct i40e_hw *hw) } > > /* > - * Parse admin queue message. 
> - * > - * return value: > - * < 0: meet error > - * 0: read sys msg > - * > 0: read cmd result > + * Read data in admin queue to get msg from pf driver > */ > static enum i40evf_aq_result > -i40evf_parse_pfmsg(struct i40e_vf *vf, > -struct i40e_arq_event_info *event, > -struct i40evf_arq_msg_info *data) > +i40evf_read_pfmsg(struct rte_eth_dev *dev, struct i40evf_arq_msg_info > +*data) > { > - enum i40e_virtchnl_ops opcode = (enum i40e_virtchnl_ops)\ > - rte_le_to_cpu_32(event->desc.cookie_high); > - enum i40e_status_code retval = (enum i40e_status_code)\ > - rte_le_to_cpu_32(event->desc.cookie_low); > - enum i40evf_aq_result ret = I40EVF_MSG_CMD; > + struct i40e_hw *hw = I40E_DEV_PRIVATE_TO_HW(dev->data- > >dev_private); > + struct i40e_vf *vf = I40EVF_DEV_PRIVATE_TO_VF(dev->data- > >dev_private); > + struct i40e_arq_event_info event; > + enum i40e_virtchnl_ops opcode; > + enum i40e_status_code retval; > + int ret; > + enum i40evf_aq_result result = I40EVF_MSG_NON; > > + event.buf_len = data->buf_len; > + event.msg_buf = data->msg; > + ret = i40e_clean_arq_element(hw, &event, NULL); > + /* Can't read any msg from adminQ */ > + if (ret) { > + if (ret == I40E_ERR_ADMIN_QUEUE_NO_WORK) > + result = I40EVF_MSG_NON; > + else > + result = I40EVF_MSG_ERR; > + return result; > + } > + > + opcode = (enum > i40e_virtchnl_ops)rte_le_to_cpu_32(event.desc.cookie_high); > + retval = (enum > +i40e_status_code)rte_le_to_cpu_32(event.desc.cookie_low); > /* pf sys event */ > if (opcode == I40E_VIRTCHNL_OP_EVENT) { > struct i40e_virtchnl_pf_event *vpe = > - (struct i40e_virtchnl_pf_event *)event->msg_buf; > + (struct i40e_virtchnl_pf_event *)event.msg_buf; > > - /* Initialize ret to sys event */ > - ret = I40EVF_MSG_SYS; > + result = I40EVF_MSG_SYS; > switch (vpe->event) { > case I40E_VIRTCHNL_EVENT_LINK_CHANGE: > vf->link_up = > @@ -286,74 +291,17 @@ i40evf_parse_pfmsg(struct i40e_vf *vf, > } > } else { > /* async reply msg on command issued by vf previously */ > - ret = I40EVF_MSG_CMD; > + result = I40EVF_MSG_CMD; > /* Actual data length read from PF */ > - data->msg_len = event->msg_len; > + data->msg_len = event.msg_len; > } > - /* fill the ops and result to notify VF */ > + > data->result = retval; > data->ops = opcode; > > - return ret; > -} > - > -/* > - * Read data in admin queue to get msg from pf driver > - */ > -static enum i40evf_aq_result > -i40evf_read_pfmsg(struct rte_eth_dev *dev, struct i4
[dpdk-dev] [PATCH v5 10/11] virtio: pci: add dummy func definition for in/outb for non-x86 arch
On Fri, Jan 29, 2016 at 12:31 PM, Yuanhan Liu wrote: > On Tue, Jan 19, 2016 at 05:16:11PM +0530, Santosh Shukla wrote: >> For non-x86 arch, Compiler will throw build error for in/out apis. Including >> dummy api function so to pass build. >> >> Note that: For virtio to work for non-x86 arch - RTE_EAL_VFIO is the only >> supported method. RTE_EAL_IGB_UIO is not supported for non-x86 arch. >> >> So, Virtio support for arch and supported interface by that arch: >> >> ARCH IGB_UIOVFIO >> x86 Y Y >> ARM64 N/A Y >> PPC_64N/A Y (Not tested but likely should work, as >> vfio is >> arch independent) >> >> Note: Applicable for virtio spec 0.95 >> >> Signed-off-by: Santosh Shukla >> --- >> drivers/net/virtio/virtio_pci.h | 46 >> +++ >> 1 file changed, 46 insertions(+) >> >> diff --git a/drivers/net/virtio/virtio_pci.h >> b/drivers/net/virtio/virtio_pci.h >> index f550d22..b88f9ec 100644 >> --- a/drivers/net/virtio/virtio_pci.h >> +++ b/drivers/net/virtio/virtio_pci.h >> @@ -46,6 +46,7 @@ >> #endif >> >> #include >> +#include "virtio_logs.h" >> >> struct virtqueue; >> >> @@ -320,6 +321,51 @@ outl_p(unsigned int data, unsigned int port) >> } >> #endif >> >> +#if !defined(RTE_ARCH_X86_64) && !defined(RTE_ARCH_I686) && \ >> + defined(RTE_EXEC_ENV_LINUXAPP) >> +static inline uint8_t >> +inb(unsigned long addr __rte_unused) >> +{ >> + PMD_INIT_LOG(ERR, "inb() not supported for this RTE_ARCH\n"); >> + return 0; >> +} > > The whole port read/write stuff is getting messy here: we not only have > to care the FreeBSD and Linux difference, but also the x86 and non-x86 > difference. And you just added yet another vfio layer. > > First of all, they are not belong here (virtio_pci.h). A new place like > virtio_io.h sounds much better to me. Therefore, I'd suggest you to > put all those stuff there, like the one I have just cooked up: > > > #ifndef __VIRTIO_IO_H__ > > #if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_I686) > > #ifdef __FreeBSD__ > #include > #include > > #define outb_p outb > #define outw_p outw > #define outl_p outl > #else > #include > #endif > > #else /* ! X86 */ > > static inline uint8_t > inb(unsigned long addr __rte_unused) > { > PMD_INIT_LOG(ERR, "inb() not supported for this RTE_ARCH\n"); > return 0; > } > > > > #endif /* X86 */ > > > /* And put the vfio io port read/write here */ > > #endif /* __VIRTIO_IO_H__ */ > > Note that you may need squash patch 4 (build fix for sys/io.h ...) > here. They both resolve one thing: to make it build on non-x86 platforms. > > Another minor note is that while you are trying this way, I'd suggest > you to make a patch to introduce virtio_io.h, and then make another > patch to fix build errors on non-x86 platforms. > Ok. > Another generic comment about this patchset is that it VERY okay to > include several components change in one set, but putting them in > order helps review a lot. > > Say, this patch set has dependence on VFIO stuff, therefore, it'd be > much better __IF__ you can put all VFIO related patches first, and > then virtio related patches follows, but not in an interleaved way > you did. If, for somereason, you can't do that, you should at least > try to minimise the chance of interleave. > I agree that, but this patch series dependent on other patches including virtio 1.0 and then vfio-noiommu, its was difficult for me to keep topic-wise sanity in patch series. V6 will take care patch ordering. Thanks > --yliu
[dpdk-dev] [PATCH v5 09/11] virtio_pci: do not parse if interface is vfio-noiommu
On Fri, Jan 29, 2016 at 12:52:53PM +0530, Santosh Shukla wrote: > On Fri, Jan 29, 2016 at 12:47 PM, Yuanhan Liu > wrote: > > Two minor nits. > > > > Firstly about your title, you should be consistent: sometimes you > > use virtio_pci, and sometimes you use virtio_pic.h. And for virtio > > pmd driver, "virtio: " prefix is pretty enough, no need another > > extra "vfio: " or "pci: " prefix. > > > > And the same to your EAL changes. EAL is a bigger, having more > > components, thus sometimes 2 prefixs are used. And if you are > > not sure how to add prefix, dig the git history to get the answer. > > > > On Tue, Jan 19, 2016 at 05:16:10PM +0530, Santosh Shukla wrote: > >> If virtio interface attached to vfio-noiommu driver then > >> do not parse for virtio resource. Instead exit with return 0; > >> > >> Note: Applicable for virtio spec 0.95. > > > > And this is not necessary: io port stuff is for virtio 0.95 only. > > virtio 1.0 won't use that, at all. > > > > Okay, > > I removed [08/11] patch from v5 series and modified this patch > accordingly [1]. So, ignore this patch and pl. review provided link. > > [1] http://dpdk.org/dev/patchwork/patch/10143/ This patch actually looks good to me, despites the two minor nits. Another note is that while sending just one single update patch, it'd be better if you could link it to the old patch, to make it in the same email thread, otherwise, it's difficult for me to notice that single-alone patch: it's easily get lost. There is another option for that: the git scissors option; you could check the git format-patch man page for more detailed info (by searching "scissors" keyword). I'm just not quite sure Thomas like it or not. --yliu
[dpdk-dev] [PATCH v5 10/11] virtio: pci: add dummy func definition for in/outb for non-x86 arch
On Fri, Jan 29, 2016 at 01:01:02PM +0530, Santosh Shukla wrote: > > Another generic comment about this patchset is that it VERY okay to > > include several components change in one set, but putting them in > > order helps review a lot. > > > > Say, this patch set has dependence on VFIO stuff, therefore, it'd be > > much better __IF__ you can put all VFIO related patches first, and > > then virtio related patches follows, but not in an interleaved way > > you did. If, for somereason, you can't do that, you should at least > > try to minimise the chance of interleave. > > > > I agree that, but this patch series dependent on other patches > including virtio 1.0 and then vfio-noiommu, its was difficult for me > to keep topic-wise sanity in patch series. That would not be an issue to me: just apply the dependence patches first, and build your patches on top of that. You just need mention the dependence info in your cover-letter. --yliu > > V6 will take care patch ordering. Thanks Thanks! --yliu
[dpdk-dev] [PATCH v3 0/4] fix the issue that DPDK takes over virtio device blindly
On Wed, Jan 27, 2016 at 11:21:18PM +0800, Huawei Xie wrote: > v3 changes: > change log message to tell user that the virtio device is skipped > due to it is managed by kernel driver, instead of asking user to > unbind it from kernel driver. > > v2 changes: > Remove unnecessary assignment of NULL to dev->data->mac_addrs > Ajust one comment's position > change LOG level from ERR to INFO > > virtio PMD doesn't set RTE_PCI_DRV_NEED_MAPPING in drv_flags of its > eth_driver. It will try igb_uio and PORT IO in turn to configure > virtio device. Even user in guest VM doesn't want to use virtio for > DPDK, virtio PMD will take over the device blindly. > > The more serious problem is kernel driver is still manipulating the > device, which causes driver conflict. > > This patch checks if there is any kernel driver manipulating the > virtio device before virtio PMD uses port IO to configure the device. Series acked: Acked-by: Yuanhan Liu --yliu
[dpdk-dev] [PATCH v4 0/4] fix performance/cache resource issues with 128-byte cache line targets
This patchset fixes performance/cache resource issues with 128-byte cache line targets found in mbuf and bitmap DPDK libraries Currently, we have two DPDK targets(ThunderX and ppc_64) which are based on 128-bytes cache line size target. This patchset doesn't introduce any performance degradation for 64-bytes cache line size targets. v1..v2 - Introduced new cache macro definitions as Suggested by Konstantin - Reduced the cache alignment requirement for 128-byte cache targets in slow-path data structures to save the memory - Verified x86(a 64byte cacheline target) does not have any impact on these changes by verifying the md5sum of app/test,app/testpmd, app/testacl binaries with or without this patch set v2..v3 revert the cache alignment of rte_ring_debug_stats, rte_mempool_debug_stats structures v3..v4 replaced RTE_CACHE_MIN_LINE_SIZE with RTE_CACHE_LINE_MIN_SIZE as suggested by olivier.matz at 6wind.com For clean git am, "config: cleanup existing RTE_CACHE_LINE_SIZE selection scheme" patch needs to apply first Jerin Jacob (4): eal: Introduce new cache line macro definitions mbuf: fix performance/cache resource issue with 128-byte cache line targets bitmap: optimize for 128-bytes cache line targets cache/slow-path: reduce cache align requirement for 128-byte cache targets app/test/test_mbuf.c | 2 +- lib/librte_eal/common/include/rte_memory.h | 16 .../linuxapp/eal/include/exec-env/rte_kni_common.h | 4 +++- lib/librte_ether/rte_ethdev.h| 4 ++-- lib/librte_mbuf/rte_mbuf.h | 2 +- lib/librte_sched/rte_bitmap.h| 10 +- 6 files changed, 28 insertions(+), 10 deletions(-) -- 2.1.0
[dpdk-dev] [PATCH v4 1/4] eal: Introduce new cache line macro definitions
- RTE_CACHE_LINE_MIN_SIZE(Supported minimum cache line size) - __rte_cache_min_aligned(Force minimum cache line alignment) - RTE_CACHE_LINE_SIZE_LOG2(Express cache line size in terms of log2) Signed-off-by: Jerin Jacob Suggested-by: Konstantin Ananyev --- lib/librte_eal/common/include/rte_memory.h | 16 1 file changed, 16 insertions(+) diff --git a/lib/librte_eal/common/include/rte_memory.h b/lib/librte_eal/common/include/rte_memory.h index 2200d58..f8dbece 100644 --- a/lib/librte_eal/common/include/rte_memory.h +++ b/lib/librte_eal/common/include/rte_memory.h @@ -74,11 +74,27 @@ enum rte_page_sizes { (RTE_CACHE_LINE_SIZE * ((size + RTE_CACHE_LINE_SIZE - 1) / RTE_CACHE_LINE_SIZE)) /**< Return the first cache-aligned value greater or equal to size. */ +/**< Cache line size in terms of log2 */ +#if RTE_CACHE_LINE_SIZE == 64 +#define RTE_CACHE_LINE_SIZE_LOG2 6 +#elif RTE_CACHE_LINE_SIZE == 128 +#define RTE_CACHE_LINE_SIZE_LOG2 7 +#else +#error "Unsupported cache line size" +#endif + +#define RTE_CACHE_LINE_MIN_SIZE 64 /**< Minimum Cache line size. */ + /** * Force alignment to cache line. */ #define __rte_cache_aligned __rte_aligned(RTE_CACHE_LINE_SIZE) +/** + * Force minimum cache line alignment. + */ +#define __rte_cache_min_aligned __rte_aligned(RTE_CACHE_LINE_MIN_SIZE) + typedef uint64_t phys_addr_t; /**< Physical address definition. */ #define RTE_BAD_PHYS_ADDR ((phys_addr_t)-1) -- 2.1.0
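A small illustration of how the new macros are intended to be used; the struct names are made up for the example, and the sizes in the comments assume a 128-byte RTE_CACHE_LINE_SIZE target such as ThunderX.

#include <stdint.h>
#include <rte_memory.h>

/* Fast-path object: aligned to the target's full cache line, i.e. 128 bytes
 * on ThunderX/ppc_64 and 64 bytes elsewhere. */
struct fast_path_obj {
	uint64_t counters[4];
} __rte_cache_aligned;

/* Slow-path object: only the minimum 64-byte alignment, so its footprint is
 * the same on 64-byte and 128-byte cache line targets. */
struct slow_path_obj {
	uint64_t counters[4];
} __rte_cache_min_aligned;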
[dpdk-dev] [PATCH v4 2/4] mbuf: fix performance/cache resource issue with 128-byte cache line targets
No need to split mbuf structure to two cache lines for 128-byte cache line size targets as it can fit on a single 128-byte cache line. Signed-off-by: Jerin Jacob --- app/test/test_mbuf.c | 2 +- lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h | 4 +++- lib/librte_mbuf/rte_mbuf.h| 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/app/test/test_mbuf.c b/app/test/test_mbuf.c index b32bef6..c24cbe0 100644 --- a/app/test/test_mbuf.c +++ b/app/test/test_mbuf.c @@ -930,7 +930,7 @@ test_failing_mbuf_sanity_check(void) static int test_mbuf(void) { - RTE_BUILD_BUG_ON(sizeof(struct rte_mbuf) != RTE_CACHE_LINE_SIZE * 2); + RTE_BUILD_BUG_ON(sizeof(struct rte_mbuf) != RTE_CACHE_LINE_MIN_SIZE * 2); /* create pktmbuf pool if it does not exist */ if (pktmbuf_pool == NULL) { diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h index 815abd6..fef914f 100644 --- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h +++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h @@ -68,6 +68,8 @@ */ #define RTE_KNI_NAMESIZE 32 +#define RTE_CACHE_LINE_MIN_SIZE 64 + /* * Request id. */ @@ -118,7 +120,7 @@ struct rte_kni_mbuf { uint16_t data_len; /**< Amount of data in segment buffer. */ /* fields on second cache line */ - char pad3[8] __attribute__((__aligned__(RTE_CACHE_LINE_SIZE))); + char pad3[8] __attribute__((__aligned__(RTE_CACHE_LINE_MIN_SIZE))); void *pool; void *next; }; diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h index f234ac9..c973e9b 100644 --- a/lib/librte_mbuf/rte_mbuf.h +++ b/lib/librte_mbuf/rte_mbuf.h @@ -814,7 +814,7 @@ struct rte_mbuf { uint16_t vlan_tci_outer; /**< Outer VLAN Tag Control Identifier (CPU order) */ /* second cache line - fields only used in slow path or on TX */ - MARKER cacheline1 __rte_cache_aligned; + MARKER cacheline1 __rte_cache_min_aligned; union { void *userdata; /**< Can be used for external metadata */ -- 2.1.0
[dpdk-dev] [PATCH v4 3/4] bitmap: optimize for 128-bytes cache line targets
existing rte_bitmap library implementation optimally configured to run on 64-bytes cache line, extending to 128-bytes cache line targets. Signed-off-by: Jerin Jacob --- lib/librte_sched/rte_bitmap.h | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/librte_sched/rte_bitmap.h b/lib/librte_sched/rte_bitmap.h index 3d911e4..03f332f 100644 --- a/lib/librte_sched/rte_bitmap.h +++ b/lib/librte_sched/rte_bitmap.h @@ -45,11 +45,11 @@ extern "C" { * The bitmap component provides a mechanism to manage large arrays of bits * through bit get/set/clear and bit array scan operations. * - * The bitmap scan operation is optimized for 64-bit CPUs using 64-byte cache + * The bitmap scan operation is optimized for 64-bit CPUs using 64/128 byte cache * lines. The bitmap is hierarchically organized using two arrays (array1 and * array2), with each bit in array1 being associated with a full cache line - * (512 bits) of bitmap bits, which are stored in array2: the bit in array1 is - * set only when there is at least one bit set within its associated array2 + * (512/1024 bits) of bitmap bits, which are stored in array2: the bit in array1 + * is set only when there is at least one bit set within its associated array2 * bits, otherwise the bit in array1 is cleared. The read and write operations * for array1 and array2 are always done in slabs of 64 bits. * @@ -81,11 +81,11 @@ extern "C" { /* Cache line (CL) */ #define RTE_BITMAP_CL_BIT_SIZE (RTE_CACHE_LINE_SIZE * 8) -#define RTE_BITMAP_CL_BIT_SIZE_LOG2 9 +#define RTE_BITMAP_CL_BIT_SIZE_LOG2 (RTE_CACHE_LINE_SIZE_LOG2 + 3) #define RTE_BITMAP_CL_BIT_MASK (RTE_BITMAP_CL_BIT_SIZE - 1) #define RTE_BITMAP_CL_SLAB_SIZE (RTE_BITMAP_CL_BIT_SIZE / RTE_BITMAP_SLAB_BIT_SIZE) -#define RTE_BITMAP_CL_SLAB_SIZE_LOG2 3 +#define RTE_BITMAP_CL_SLAB_SIZE_LOG2 (RTE_BITMAP_CL_BIT_SIZE_LOG2 - RTE_BITMAP_SLAB_BIT_SIZE_LOG2) #define RTE_BITMAP_CL_SLAB_MASK (RTE_BITMAP_CL_SLAB_SIZE - 1) /** Bitmap data structure */ -- 2.1.0
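As a quick check of the new expressions on a 128-byte cache line target: RTE_BITMAP_CL_BIT_SIZE = 128 * 8 = 1024 bits, so RTE_BITMAP_CL_BIT_SIZE_LOG2 = 7 + 3 = 10; with 64-bit slabs (RTE_BITMAP_SLAB_BIT_SIZE_LOG2 = 6), RTE_BITMAP_CL_SLAB_SIZE = 1024 / 64 = 16 and RTE_BITMAP_CL_SLAB_SIZE_LOG2 = 10 - 6 = 4. On a 64-byte target the same expressions reduce to the previous hard-coded values of 9 and 3.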
[dpdk-dev] [PATCH v4 4/4] cache/slow-path: reduce cache align requirement for 128-byte cache targets
slow-path data structures need not be 128-byte cache aligned. Reduce the alignment to 64-byte to save the memory. No behavior change for 64-byte cache aligned systems as minimum cache line size as 64. Signed-off-by: Jerin Jacob Acked-by: Konstantin Ananyev --- lib/librte_ether/rte_ethdev.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h index 8710dd7..16da821 100644 --- a/lib/librte_ether/rte_ethdev.h +++ b/lib/librte_ether/rte_ethdev.h @@ -863,7 +863,7 @@ struct rte_eth_rxq_info { struct rte_eth_rxconf conf; /**< queue config parameters. */ uint8_t scattered_rx; /**< scattered packets RX supported. */ uint16_t nb_desc; /**< configured number of RXDs. */ -} __rte_cache_aligned; +} __rte_cache_min_aligned; /** * Ethernet device TX queue information structure. @@ -872,7 +872,7 @@ struct rte_eth_rxq_info { struct rte_eth_txq_info { struct rte_eth_txconf conf; /**< queue config parameters. */ uint16_t nb_desc; /**< configured number of TXDs. */ -} __rte_cache_aligned; +} __rte_cache_min_aligned; /** Maximum name length for extended statistics counters */ #define RTE_ETH_XSTATS_NAME_SIZE 64 -- 2.1.0
[dpdk-dev] [PATCH] ixgbe: fix x550 VF tx issue
On x550, the basic tx function may not work if we use DPDK VF based on kernel PF. The reason is kernel PF enables a new feature "malicious driver detection". According to this feature, driver should create TX context descriptor and set the "Check Context" bit in advanced data descriptor. This patch add the support of "malicious driver detection" to let kernel PF based DPDK VF work. Although CC bit should be set in IOV mode, as tested, it's no harm to set CC bit in non-IVO mode. So, as PF and VF share the same code, will set CC bit anyway. Please aware there's another way, disabling MDD in PF kernel driver. Like this, >insmod ixgbe.ko MDD=0,0 Signed-off-by: Wenzhuo Lu --- doc/guides/rel_notes/release_2_3.rst | 12 drivers/net/ixgbe/ixgbe_rxtx.c | 16 ++-- drivers/net/ixgbe/ixgbe_rxtx.h | 3 ++- lib/librte_ether/rte_ethdev.h| 1 + lib/librte_mbuf/rte_mbuf.h | 3 ++- 5 files changed, 31 insertions(+), 4 deletions(-) diff --git a/doc/guides/rel_notes/release_2_3.rst b/doc/guides/rel_notes/release_2_3.rst index 99de186..90f81e0 100644 --- a/doc/guides/rel_notes/release_2_3.rst +++ b/doc/guides/rel_notes/release_2_3.rst @@ -15,6 +15,18 @@ EAL Drivers ~~~ +* **ixgbe: Fixed x550 VF tx issue.** + + On x550, the basic tx function may not work if we use DPDK VF based on + kernel PF. The reason is kernel PF enables a new feature "malicious driver + detection". According to this feature, driver should create TX context + descriptor and set the "Check Context" bit in advanced data descriptor. + This patch add the support of "malicious driver detection" to let kernel PF + based DPDK VF work. + + Please aware there's another way, disabling MDD in PF kernel driver. + Like this, + >insmod ixgbe.ko MDD=0,0 Libraries ~ diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c index 52a263c..c8a7740 100644 --- a/drivers/net/ixgbe/ixgbe_rxtx.c +++ b/drivers/net/ixgbe/ixgbe_rxtx.c @@ -85,7 +85,8 @@ PKT_TX_VLAN_PKT |\ PKT_TX_IP_CKSUM |\ PKT_TX_L4_MASK | \ - PKT_TX_TCP_SEG) + PKT_TX_TCP_SEG | \ + PKT_TX_MALICIOUS_DRIVER_DETECT) static inline struct rte_mbuf * rte_rxmbuf_alloc(struct rte_mempool *mp) @@ -564,6 +565,8 @@ ixgbe_xmit_cleanup(struct ixgbe_tx_queue *txq) return (0); } +#define DEFAULT_CTX_L2_LEN 14 + uint16_t ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts) @@ -614,11 +617,19 @@ ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, * are needed for offload functionality. */ ol_flags = tx_pkt->ol_flags; + if (!(txq->txq_flags & ETH_TXQ_FLAGS_NOMDD)) + ol_flags |= PKT_TX_MALICIOUS_DRIVER_DETECT; /* If hardware offload required */ tx_ol_req = ol_flags & IXGBE_TX_OFFLOAD_MASK; if (tx_ol_req) { - tx_offload.l2_len = tx_pkt->l2_len; + /* if l2 len isn't provided by the caller, give it a +* default value. +*/ + if (tx_pkt->l2_len == 0) + tx_offload.l2_len = DEFAULT_CTX_L2_LEN; + else + tx_offload.l2_len = tx_pkt->l2_len; tx_offload.l3_len = tx_pkt->l3_len; tx_offload.l4_len = tx_pkt->l4_len; tx_offload.vlan_tci = tx_pkt->vlan_tci; @@ -801,6 +812,7 @@ ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, } olinfo_status |= (pkt_len << IXGBE_ADVTXD_PAYLEN_SHIFT); + olinfo_status |= IXGBE_ADVTXD_CC; m_seg = tx_pkt; do { diff --git a/drivers/net/ixgbe/ixgbe_rxtx.h b/drivers/net/ixgbe/ixgbe_rxtx.h index 475a800..bcde785 100644 --- a/drivers/net/ixgbe/ixgbe_rxtx.h +++ b/drivers/net/ixgbe/ixgbe_rxtx.h @@ -255,7 +255,8 @@ struct ixgbe_txq_ops { * RTE_PMD_IXGBE_TX_MAX_BURST. 
*/ #define IXGBE_SIMPLE_FLAGS ((uint32_t)ETH_TXQ_FLAGS_NOMULTSEGS | \ - ETH_TXQ_FLAGS_NOOFFLOADS) + ETH_TXQ_FLAGS_NOOFFLOADS | \ + ETH_TXQ_FLAGS_NOMDD) /* * Populate descriptors with the following info: diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h index bada8ad..a2e32d0 100644 --- a/lib/librte_ether/rte_ethdev.h +++ b/lib/librte_ether/rte_ethdev.h @@ -637,6 +637,7 @@ struct rte_eth_rxconf { #define ETH_TXQ_FLAGS_NOXSUMSCTP 0x0200 /**< disable SCTP checksum offload */ #define ETH_TXQ_FLAGS_NOXSUMUDP 0x0400 /**< disable UDP checksum offload */ #define ETH_TXQ_FLAGS_NOXSUMTCP 0x0800 /**< disable TCP checksum offload */ +#define ETH_TXQ_FLAGS_NOMDD
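Under this proposal, an application that does not want the extra context descriptor could opt out per queue with the new ETH_TXQ_FLAGS_NOMDD flag defined above; a hedged usage sketch (port, queue and descriptor counts are arbitrary, the flag name is as proposed in this patch):

#include <rte_ethdev.h>

static int
setup_txq_without_mdd(uint8_t port_id, uint16_t queue_id, uint16_t nb_desc,
		      unsigned int socket_id)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_txconf txconf;

	rte_eth_dev_info_get(port_id, &dev_info);
	txconf = dev_info.default_txconf;
	/* Proposed flag from this patch: skip MDD handling (CC bit and the
	 * implicit context descriptor) on this queue. */
	txconf.txq_flags |= ETH_TXQ_FLAGS_NOMDD;

	return rte_eth_tx_queue_setup(port_id, queue_id, nb_desc,
				      socket_id, &txconf);
}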
[dpdk-dev] [PATCH v2] ixgbe: Fix disable interrupt twice
Hi Michael, > -Original Message- > From: Qiu, Michael > Sent: Friday, January 29, 2016 1:58 PM > To: dev at dpdk.org > Cc: Zhou, Danny; Liu, Yong; Liang, Cunming; Lu, Wenzhuo; Qiu, Michael > Subject: [PATCH v2] ixgbe: Fix disable interrupt twice > > Currently, ixgbe vf and pf will disable interrupt twice in stop stage and > uninit > stage. It will cause an error: > > testpmd> quit > > Shutting down port 0... > Stopping ports... > Done > Closing ports... > EAL: Error disabling MSI-X interrupts for fd 26 > Done > > Because the interrupt has already been disabled in the stop stage. > Since it is enabled in the init stage, it is better to remove it from the stop stage. I'm afraid it's not a good idea to just remove the intr_disable from dev_stop. I think dev_stop has a chance to be used independently of dev_uninit. In this scenario, we still need intr_disable, right? Maybe what we need is a check before we disable the intr :)
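For illustration, a minimal sketch of the kind of guard suggested here — the helper and the enabled flag are hypothetical, not part of the patch:

#include <rte_interrupts.h>

/* Hypothetical guard: disable the interrupt only if it is still enabled,
 * so dev_stop() followed by dev_uninit() does not disable it twice. */
static void
intr_disable_once(struct rte_intr_handle *intr_handle, int *intr_enabled)
{
	if (!*intr_enabled)
		return;
	rte_intr_disable(intr_handle);
	*intr_enabled = 0;
}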
[dpdk-dev] [PATCH v2 2/2] i40evf: support interrupt based pf reset request
> -Original Message- > From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Jingjing Wu > Sent: Wednesday, January 27, 2016 9:50 AM > To: dev at dpdk.org > Subject: [dpdk-dev] [PATCH v2 2/2] i40evf: support interrupt based pf reset > request > > Interrupt based request of PF reset from PF is supported by enabling the > adminq event process in VF driver. > Users can register a callback for this interrupt event to get informed, when a > PF reset request detected like: > rte_eth_dev_callback_register(portid, > RTE_ETH_EVENT_INTR_RESET, > reset_event_callback, > arg); > > Signed-off-by: Jingjing Wu Two questions: 1. If the VF RX/TX queues use MSI-X vector 1 and the admin queue uses MSI-X vector 0, how can the two interrupts both be read in user space? In a VM, VFIO is not supported. 2. If we want to run l3fwd-power in a VF, we can only assign the RX/TX interrupt to MSI-X vector 0, but then both threads will use epoll to wait on the vector 0 event, and the interrupt thread may miss the VF reset event: if the l3fwd-power thread cleans up the vector 0 fd first and no more packets or message responses come in, the interrupt thread will never be woken up again.
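For context, a hedged sketch of how an application might register and handle the reset event described in the patch — the callback body and recovery steps are illustrative only:

#include <stdio.h>
#include <rte_ethdev.h>

static void
reset_event_callback(uint8_t port_id, enum rte_eth_event_type type, void *arg)
{
	(void)arg;
	if (type != RTE_ETH_EVENT_INTR_RESET)
		return;
	printf("PF reset request detected on port %u\n", port_id);
	/* Application-specific recovery: typically stop the port and
	 * re-initialize it from the main loop. */
}

static void
register_reset_handler(uint8_t port_id)
{
	rte_eth_dev_callback_register(port_id, RTE_ETH_EVENT_INTR_RESET,
				      reset_event_callback, NULL);
}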
[dpdk-dev] [RFC PATCH 5/5] virtio: Extend virtio-net PMD to support container environment
On Thu, Jan 21, 2016 at 08:07:58PM +0900, Tetsuya Mukawa wrote: > +static int > +virt_read_pci_cfg(struct virtio_hw *hw, void *buf, size_t len, off_t offset) > +{ > + qtest_read_pci_cfg(hw, "virtio-net", buf, len, offset); > + return 0; > +} > + > +static void * > +virt_get_mapped_addr(struct virtio_hw *hw, uint8_t bar, > + uint32_t offset, uint32_t length) > +{ > + uint64_t base; > + uint64_t size; > + > + if (qtest_get_bar_size(hw, "virtio-net", bar, &size) < 0) { > + PMD_INIT_LOG(ERR, "invalid bar: %u", bar); > + return NULL; > + } > + > + if (offset + length < offset) { > + PMD_INIT_LOG(ERR, "offset(%u) + lenght(%u) overflows", > + offset, length); > + return NULL; > + } > + > + if (offset + length > size) { > + PMD_INIT_LOG(ERR, > + "invalid cap: overflows bar space: %u > %"PRIu64, > + offset + length, size); > + return NULL; > + } > + > + if (qtest_get_bar_addr(hw, "virtio-net", bar, &base) < 0) { > + PMD_INIT_LOG(ERR, "invalid bar: %u", bar); > + return NULL; > + } So, I understood the usage now, and the cfg_ops abstraction doesn't look good yet necessary to me. For EAL managed pci device, bar length and addr are stored at memory_resources[], and for your case, it's from the qtest. And judging that it's compile time decision, I'd like it to be: #ifdef /* RTE_LIBRTE_VIRTIO_HOST_MODE */ static uint32_t get_bar_size(...) { return qtest_get_bar_size(..); } static uint64-t get_bar_addr(...) { return qtest_get_bar_addr(..); } ... ... #else static uint32_t get_bar_size(...) { return dev->mem_resource[bar].addr; } ... } #endif And then you just need do related changes at virtio_read_caps() and get_cfg_addr(). That'd be much simpler, without introducing duplicate code and uncessary complex. What do you think of that? --yliu > + > + return (void *)(base + offset); > +} > + > +static const struct virtio_pci_cfg_ops virt_cfg_ops = { > + .map= virt_map_pci_cfg, > + .unmap = virt_unmap_pci_cfg, > + .get_mapped_addr= virt_get_mapped_addr, > + .read = virt_read_pci_cfg, > +}; > +#endif /* RTE_LIBRTE_VIRTIO_HOST_MODE */
[dpdk-dev] [RFC PATCH 5/5] virtio: Extend virtio-net PMD to support container environment
On 1/28/2016 10:44 AM, Tetsuya Mukawa wrote: > On 2016/01/27 19:03, Xie, Huawei wrote: >> On 1/21/2016 7:09 PM, Tetsuya Mukawa wrote: >>> + /* Set BAR region */ >>> + for (i = 0; i < NB_BAR; i++) { >>> + switch (dev->bar[i].type) { >>> + case QTEST_PCI_BAR_IO: >>> + case QTEST_PCI_BAR_MEMORY_UNDER_1MB: >>> + case QTEST_PCI_BAR_MEMORY_32: >>> + qtest_pci_outl(s, bus, device, 0, dev->bar[i].addr, >>> + dev->bar[i].region_start); >>> + PMD_DRV_LOG(INFO, "Set BAR of %s device: 0x%lx - >>> 0x%lx\n", >>> + dev->name, dev->bar[i].region_start, >>> + dev->bar[i].region_start + >>> dev->bar[i].region_size); >>> + break; >>> + case QTEST_PCI_BAR_MEMORY_64: >>> + qtest_pci_outq(s, bus, device, 0, dev->bar[i].addr, >>> + dev->bar[i].region_start); >>> + PMD_DRV_LOG(INFO, "Set BAR of %s device: 0x%lx - >>> 0x%lx\n", >>> + dev->name, dev->bar[i].region_start, >>> + dev->bar[i].region_start + >>> dev->bar[i].region_size); >>> + break; >> Hasn't the bar resource already been allocated? Is it the app's >> responsibility to allocate the bar resource in qtest mode? The app >> couldn't have that knowledge. > Yes. In qtest mode, the app should register above values. > (Without it, default values are 0) > Usually, this will be done by BIOS or uEFI. But in qtest mode, these > will not be invoked. > So we need to define above values, and also need to enable PCI devices. > > In this release, I just register hard coded values except for one of > ivshmem BAR. > In next release, I will describe memory map in comment. I think ideally this app should do the whole PCI system initialization(and also the north/south bridge) using the DFS algorithm on behalf of the BIOS, to allocate resource for all bridge and devices. Otherwise if QEMU follows the hardware platform's behavior to route MMIO/IO access, if we only allocate resources for part of the devices, the transactions could not be routed correctly. Anyway we shouldn't bother us. It is OK as long as it works. > > Tetsuya > >
[dpdk-dev] [PATCH v5 09/11] virtio_pci: do not parse if interface is vfio-noiommu
2016-01-29 15:34, Yuanhan Liu: > There is another option for that: the git scissors option; you could > check the git format-patch man page for more detailed info (by searching > "scissors" keyword). I'm just not quite sure Thomas like it or not. For simple discussions, patch after scissors may be a good option. In this series, there are too many patches sent alone and it would be good to send the whole series to make things flatter (while keeping the --in-reply-to of course).
[dpdk-dev] [RFC PATCH 5/5] virtio: Extend virtio-net PMD to support container environment
On Fri, Jan 29, 2016 at 04:57:23PM +0800, Yuanhan Liu wrote: > On Thu, Jan 21, 2016 at 08:07:58PM +0900, Tetsuya Mukawa wrote: > > +static int > > +virt_read_pci_cfg(struct virtio_hw *hw, void *buf, size_t len, off_t > > offset) > > +{ > > + qtest_read_pci_cfg(hw, "virtio-net", buf, len, offset); > > + return 0; > > +} > > + > > +static void * > > +virt_get_mapped_addr(struct virtio_hw *hw, uint8_t bar, > > +uint32_t offset, uint32_t length) > > +{ > > + uint64_t base; > > + uint64_t size; > > + > > + if (qtest_get_bar_size(hw, "virtio-net", bar, &size) < 0) { > > + PMD_INIT_LOG(ERR, "invalid bar: %u", bar); > > + return NULL; > > + } > > + > > + if (offset + length < offset) { > > + PMD_INIT_LOG(ERR, "offset(%u) + lenght(%u) overflows", > > + offset, length); > > + return NULL; > > + } > > + > > + if (offset + length > size) { > > + PMD_INIT_LOG(ERR, > > + "invalid cap: overflows bar space: %u > %"PRIu64, > > + offset + length, size); > > + return NULL; > > + } > > + > > + if (qtest_get_bar_addr(hw, "virtio-net", bar, &base) < 0) { > > + PMD_INIT_LOG(ERR, "invalid bar: %u", bar); > > + return NULL; > > + } > > So, I understood the usage now, and the cfg_ops abstraction doesn't look > good yet necessary to me. For EAL managed pci device, bar length and > addr are stored at memory_resources[], and for your case, it's from the > qtest. And judging that it's compile time decision, I'd like it to be: > > #ifdef /* RTE_LIBRTE_VIRTIO_HOST_MODE */ Oops, sorry, I was wrong. Your code could be co-exist with the traditional virtio pmd driver, thus we can't do that. But still, I think dynamic "if ... else ..." should be better: there are just few places (maybe 4: bar_size, bar length, map device, read config) need that. On the other hand, if you really want to do that abstraction, you should go it with more fine granularity, such as the following methods I proposed, instead of the big one: get_cfg_addr(). In that way, we could avoid duplicate code. --yliu > > static uint32_t > get_bar_size(...) > { > return qtest_get_bar_size(..); > } > > static uint64-t > get_bar_addr(...) > { > return qtest_get_bar_addr(..); > } > > ... > ... > > #else > > static uint32_t > get_bar_size(...) > { > return dev->mem_resource[bar].addr; > } > > ... > > } > #endif > > > And then you just need do related changes at virtio_read_caps() and > get_cfg_addr(). That'd be much simpler, without introducing duplicate > code and uncessary complex. > > What do you think of that? > > --yliu > > > + > > + return (void *)(base + offset); > > +} > > + > > +static const struct virtio_pci_cfg_ops virt_cfg_ops = { > > + .map= virt_map_pci_cfg, > > + .unmap = virt_unmap_pci_cfg, > > + .get_mapped_addr= virt_get_mapped_addr, > > + .read = virt_read_pci_cfg, > > +}; > > +#endif /* RTE_LIBRTE_VIRTIO_HOST_MODE */
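A minimal sketch of the run-time branch suggested above — the is_qtest flag on struct virtio_hw is an assumption, qtest_get_bar_addr() is the helper from this RFC, and the virtio PMD internal headers are assumed to be visible:

#include <stdint.h>
#include <rte_pci.h>

/* Hypothetical: choose the BAR address at run time instead of going
 * through a cfg_ops vtable; only a handful of call sites need this. */
static int
get_bar_addr(struct virtio_hw *hw, struct rte_pci_device *pci_dev,
	     uint8_t bar, uint64_t *base)
{
	if (hw->is_qtest)	/* assumed flag set in qtest/container mode */
		return qtest_get_bar_addr(hw, "virtio-net", bar, base);

	*base = (uint64_t)(uintptr_t)pci_dev->mem_resource[bar].addr;
	return 0;
}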
[dpdk-dev] [PATCH v5 09/11] virtio_pci: do not parse if interface is vfio-noiommu
On Fri, Jan 29, 2016 at 10:02:26AM +0100, Thomas Monjalon wrote: > 2016-01-29 15:34, Yuanhan Liu: > > There is another option for that: the git scissors option; you could > > check the git format-patch man page for more detailed info (by searching > > "scissors" keyword). I'm just not quite sure Thomas like it or not. > > For simple discussions, patch after scissors may be a good option. > > In this series, there are too many patches sent alone and it would > be good to send the whole series to make things flatter (while keeping > the --in-reply-to of course). Agreed. --yliu
[dpdk-dev] [PATCH v2 3/3] virtio: Add a new layer to abstract pci access method
On Thu, Jan 28, 2016 at 06:33:32PM +0900, Tetsuya Mukawa wrote: > This patch addss function pointers to abstract pci access method. > This abstraction layer will be used when virtio-net PMD supports > container extension. > > The below functions abstract how to access to pci configuration space. > > struct virtio_pci_cfg_ops { > int (*map)(...); > void (*unmap)(...); > void *(*get_mapped_addr)(...); > int (*read)(...); > }; > > The pci configuration space has information how to access to virtio > device registers. Basically, there are 2 ways to acccess to the > registers. One is using portio and the other is using mapped memory. > The below functions abstract this access method. One question: is there a way to map PCI memory with Qtest? I'm thinking if we can keep the io_read/write() for Qtest as well, if so, code could be simplified, a lot, IMO. --yliu
[dpdk-dev] [PATCH v5 09/11] virtio_pci: do not parse if interface is vfio-noiommu
On Fri, Jan 29, 2016 at 2:32 PM, Thomas Monjalon wrote: > 2016-01-29 15:34, Yuanhan Liu: >> There is another option for that: the git scissors option; you could >> check the git format-patch man page for more detailed info (by searching >> "scissors" keyword). I'm just not quite sure Thomas like it or not. > I agree and my mistake.. sorry for confusion, > For simple discussions, patch after scissors may be a good option. > > In this series, there are too many patches sent alone and it would > be good to send the whole series to make things flatter (while keeping > the --in-reply-to of course). Sending the whole series tagged as patch version v6 shortly. Thanks.
[dpdk-dev] [PATCH v4 00/14] Add virtio support for arm/arm64
On Fri, Jan 29, 2016 at 12:57 PM, Xie, Huawei wrote: > On 1/14/2016 9:29 PM, Santosh Shukla wrote: >> Hi, >> >> This v4 patch uses vfio-noiommu-way to access virtio-net pci interface. >> Tested for arm64 thunderX platform. Patch builds for >> x86/i386/arm/armv8/thunderX. Tested with testpmd application. > > Hi Shukla: > I would take two week's leave from tomorrow and will not have enough > bandwidth to review your code. I think yuanhan will do that. Sorry for > the inconvenience. > Btw, when you send a new version, could you add --in-reply-to so that > all your patches are within one thread? Okay. BTW, v4 is the old version now and v5 is the most recent; I received review comments from Yuan/Thomas, so I am sending v6 now.
[dpdk-dev] [PATCH] pci: Add the class_id support in pci probe
On 01/28/2016 11:38 PM, Thomas Monjalon wrote: > 2016-01-13 14:22, Panu Matilainen: >> On 01/13/2016 01:55 PM, Bruce Richardson wrote: >>> On Thu, Dec 31, 2015 at 09:12:14AM -0800, Stephen Hemminger wrote: On Tue, 29 Dec 2015 10:53:26 +0800 Ziye Yang wrote: > This patch is used to add the class_id support > for pci_probe since some devices need the class_info > (class_code, subclass_code, programming_interface) > > Signed-off-by: Ziye Yang Since rte_pci is exposed to application this breaks the ABI. >>> >>> But applications are not going to be defining rte_pci_ids values >>> internally, are >>> they? That is for drivers to use. Is this really an ABI breakage for >>> applications that we >>> need to be concerned about? >> >> There might not be applications using it but drivers are ABI consumers >> too - think of 3rd party drivers and such. > > Drivers are not ABI consumers in the sense that ABI means > Application Binary Interface. > We are talking about drivers interface here. > When establishing the ABI policy we were discussing about applications only. Generally speaking an ABI is an interface between two program (or software if you like) modules, its not specific to "applications". Looking at http://dpdk.org/doc/guides/contributing/versioning.html I see it does only talk about applications, but an ABI consumer can also be another library. A driver calling rte_malloc() is just as much librte_eal ABI consumer as anything else. Now, I understand that drivers use and need interface(s) that applications have no use for or simply cannot use, and those interfaces could be subject to different policies. As an extreme example, the Linux kernel has two isolated ABIs, one is the userland system call interface which is guaranteed to stay forever and the other is kernel module interface, guarantees nothing at all. In DPDK the difference is far muddier than that since all the interfaces live in common, versioned userland DSOs. So if there are two different interfaces following two different policies, it's all the more important to clearly document them. One simple way could be using a different prefix than rte_. > I agree we must allow 3rd party drivers but there is no good reason > to try to upgrade DPDK without upgrading/porting the external drivers. > If someone does not want to release its driver and keep upgrading DPDK, > it is acceptable IMHO to force an upgrade of its driver. Note that I've no particular sympathy for 3rd party drivers as such. What I *do* care about is that breakage is made explicit, as in drivers built for an incompatible version refuse to load at all, instead of silently corrupting memory etc. - Panu -
[dpdk-dev] [PATCH v7 3/5] ethdev: redesign link speed config API
Hi Marc, > -Original Message- > From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Marc Sune > Sent: Friday, January 29, 2016 12:42 AM > To: dev at dpdk.org; Lu, Wenzhuo; Zhang, Helin; Harish Patil; Chen, Jing D > Subject: [dpdk-dev] [PATCH v7 3/5] ethdev: redesign link speed config API > > This patch redesigns the API to set the link speed/s configure > for an ethernet port. Specifically: > > - it allows to define a set of advertised speeds for > auto-negociation. > - it allows to disable link auto-negociation (single fixed speed). > - default: auto-negociate all supported speeds. > > Other changes: > > * Added utility MACROs ETH_SPEED_NUM_XXX with the numeric > values of all supported link speeds, in Mbps. > * Converted link_speed to uint32_t to accomodate 100G speeds > (bug). > * Added autoneg flag in struct rte_eth_link to indicate if > link speed was a result of auto-negociation or was fixed > by configuration. > * Added utility function to convert numeric speeds to bitmap > fields. Can you avoid modifications in the e1000/base code? We do not modify (and maintain) that part on our own. Instead we take it straight from Intel ND. So if you feel like these changes are really necessary - please submit a patch to ND first, and if your changes will be applied, will pick it up from them. Thanks Konstantin > > Signed-off-by: Marc Sune > --- > app/test-pmd/cmdline.c | 124 > +++-- > app/test/virtual_pmd.c | 4 +- > drivers/net/af_packet/rte_eth_af_packet.c | 5 +- > drivers/net/bonding/rte_eth_bond_8023ad.c | 14 ++-- > drivers/net/cxgbe/base/t4_hw.c | 8 +- > drivers/net/e1000/base/e1000_80003es2lan.c | 6 +- > drivers/net/e1000/base/e1000_82541.c | 8 +- > drivers/net/e1000/base/e1000_82543.c | 4 +- > drivers/net/e1000/base/e1000_82575.c | 11 +-- > drivers/net/e1000/base/e1000_api.c | 2 +- > drivers/net/e1000/base/e1000_api.h | 2 +- > drivers/net/e1000/base/e1000_defines.h | 4 +- > drivers/net/e1000/base/e1000_hw.h | 2 +- > drivers/net/e1000/base/e1000_ich8lan.c | 7 +- > drivers/net/e1000/base/e1000_mac.c | 9 ++- > drivers/net/e1000/base/e1000_mac.h | 6 +- > drivers/net/e1000/base/e1000_vf.c | 4 +- > drivers/net/e1000/base/e1000_vf.h | 2 +- > drivers/net/e1000/em_ethdev.c | 113 +- > drivers/net/e1000/igb_ethdev.c | 108 + > drivers/net/fm10k/fm10k_ethdev.c | 8 +- > drivers/net/i40e/i40e_ethdev.c | 73 - > drivers/net/i40e/i40e_ethdev_vf.c | 11 +-- > drivers/net/ixgbe/ixgbe_ethdev.c | 78 -- > drivers/net/mlx4/mlx4.c| 2 + > drivers/net/mpipe/mpipe_tilegx.c | 6 +- > drivers/net/null/rte_eth_null.c| 5 +- > drivers/net/pcap/rte_eth_pcap.c| 9 ++- > drivers/net/ring/rte_eth_ring.c| 5 +- > drivers/net/virtio/virtio_ethdev.c | 2 +- > drivers/net/virtio/virtio_ethdev.h | 2 - > drivers/net/vmxnet3/vmxnet3_ethdev.c | 5 +- > drivers/net/xenvirt/rte_eth_xenvirt.c | 5 +- > examples/ip_pipeline/config_parse.c| 3 +- > lib/librte_ether/rte_ethdev.c | 49 > lib/librte_ether/rte_ethdev.h | 113 -- > 36 files changed, 454 insertions(+), 365 deletions(-) > > diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c > index 6d28c1b..00571bc 100644 > --- a/app/test-pmd/cmdline.c > +++ b/app/test-pmd/cmdline.c > @@ -956,14 +956,65 @@ struct cmd_config_speed_all { > cmdline_fixed_string_t value2; > }; > > +static int > +parse_and_check_speed_duplex(char *value1, char *value2, uint32_t > *link_speed) > +{ > + > + int duplex; > + > + if (!strcmp(value2, "half")) { > + duplex = 0; > + } else if (!strcmp(value2, "full")) { > + duplex = 1; > + } else if (!strcmp(value2, "auto")) { > + duplex = 1; > + } else { > + 
printf("Unknown parameter\n"); > + return -1; > + } > + > + if (!strcmp(value1, "10")) { > + *link_speed = (duplex) ? ETH_LINK_SPEED_10M : > + ETH_LINK_SPEED_10M_HD; > + } else if (!strcmp(value1, "100")) { > + *link_speed = (duplex) ? ETH_LINK_SPEED_100M : > + ETH_LINK_SPEED_100M_HD; > + } else if (!strcmp(value1, "1000")) { > + if (!duplex) > + goto invalid_speed_param; > + *link_speed = ETH_LINK_SPEED_1G; > + } else if (!strcmp(value1, "1")) { > + if (!duplex) > + goto invalid_speed_param; > + *link_speed = ETH_LINK
[dpdk-dev] [PATCH] pci: Add the class_id support in pci probe
2016-01-29 11:21, Panu Matilainen: > On 01/28/2016 11:38 PM, Thomas Monjalon wrote: > > 2016-01-13 14:22, Panu Matilainen: > >> On 01/13/2016 01:55 PM, Bruce Richardson wrote: > >>> On Thu, Dec 31, 2015 at 09:12:14AM -0800, Stephen Hemminger wrote: > On Tue, 29 Dec 2015 10:53:26 +0800 > Ziye Yang wrote: > > > This patch is used to add the class_id support > > for pci_probe since some devices need the class_info > > (class_code, subclass_code, programming_interface) > > > > Signed-off-by: Ziye Yang > > Since rte_pci is exposed to application this breaks the ABI. > >>> > >>> But applications are not going to be defining rte_pci_ids values > >>> internally, are > >>> they? That is for drivers to use. Is this really an ABI breakage for > >>> applications that we > >>> need to be concerned about? > >> > >> There might not be applications using it but drivers are ABI consumers > >> too - think of 3rd party drivers and such. > > > > Drivers are not ABI consumers in the sense that ABI means > > Application Binary Interface. > > We are talking about drivers interface here. > > When establishing the ABI policy we were discussing about applications only. > > Generally speaking an ABI is an interface between two program (or > software if you like) modules, its not specific to "applications". > Looking at http://dpdk.org/doc/guides/contributing/versioning.html I see > it does only talk about applications, but an ABI consumer can also be > another library. A driver calling rte_malloc() is just as much > librte_eal ABI consumer as anything else. > > Now, I understand that drivers use and need interface(s) that > applications have no use for or simply cannot use, and those interfaces > could be subject to different policies. As an extreme example, the Linux > kernel has two isolated ABIs, one is the userland system call interface > which is guaranteed to stay forever and the other is kernel module > interface, guarantees nothing at all. > > In DPDK the difference is far muddier than that since all the interfaces > live in common, versioned userland DSOs. So if there are two different > interfaces following two different policies, it's all the more important > to clearly document them. One simple way could be using a different > prefix than rte_. Good suggestion. Or we can simply have different files with a clear notice in their headers and in the versioning doc. It was well split in rte_cryptodev_pmd.h > > I agree we must allow 3rd party drivers but there is no good reason > > to try to upgrade DPDK without upgrading/porting the external drivers. > > If someone does not want to release its driver and keep upgrading DPDK, > > it is acceptable IMHO to force an upgrade of its driver. > > Note that I've no particular sympathy for 3rd party drivers as such. > What I *do* care about is that breakage is made explicit, as in drivers > built for an incompatible version refuse to load at all, instead of > silently corrupting memory etc. OK I agree. Anyway the ABI versionning does not cover the structure changes. What about making a DPDK version check when registering a driver? So a binary driver would be clearly bound to a DPDK version. And we should change or remove the .so version which never change for most of drivers.
[dpdk-dev] [PATCH v7 3/5] ethdev: redesign link speed config API
2016-01-29 09:24, Ananyev, Konstantin: > Can you avoid modifications in the e1000/base code? > We do not modify (and maintain) that part on our own. > Instead we take it straight from Intel ND. > So if you feel like these changes are really necessary - please submit a patch > to ND first, and if your changes will be applied, will pick it up from them. I was not aware we can submit a change to ND for Intel base drivers. What is the procedure please? May it be documented in the DPDK doc? Thanks
[dpdk-dev] [PATCH v7 3/5] ethdev: redesign link speed config API
Hi Thomas, > -Original Message- > From: Thomas Monjalon [mailto:thomas.monjalon at 6wind.com] > Sent: Friday, January 29, 2016 9:38 AM > To: Ananyev, Konstantin > Cc: dev at dpdk.org; Marc Sune; Lu, Wenzhuo; Zhang, Helin; Harish Patil; > Chen, Jing D; Mcnamara, John > Subject: Re: [dpdk-dev] [PATCH v7 3/5] ethdev: redesign link speed config API > > 2016-01-29 09:24, Ananyev, Konstantin: > > Can you avoid modifications in the e1000/base code? > > We do not modify (and maintain) that part on our own. > > Instead we take it straight from Intel ND. > > So if you feel like these changes are really necessary - please submit a > > patch > > to ND first, and if your changes will be applied, will pick it up from them. > > I was not aware we can submit a change to ND for Intel base drivers. > What is the procedure please? I meant not to ND directly, but probably to the FreeBSD e1000 kernel driver. As I remember, that is the closest one to what we have. From my understanding (I might be wrong here): if they are accepted, we should see these changes in the next code drops from ND. Konstantin > May it be documented in the DPDK doc? > > Thanks
[dpdk-dev] [PATCH v7 3/5] ethdev: redesign link speed config API
2016-01-29 09:47, Ananyev, Konstantin: > From: Thomas Monjalon [mailto:thomas.monjalon at 6wind.com] > > 2016-01-29 09:24, Ananyev, Konstantin: > > > Can you avoid modifications in the e1000/base code? > > > We do not modify (and maintain) that part on our own. > > > Instead we take it straight from Intel ND. > > > So if you feel like these changes are really necessary - please submit a > > > patch > > > to ND first, and if your changes will be applied, will pick it up from > > > them. > > > > I was not aware we can submit a change to ND for Intel base drivers. > > What is the procedure please? > > I meant not to the ND directly, but probably to the freebsd e1000 kernel > driver. > As I remember, that is the closest one to what we have. > From my understanding (I might be wrong here): > If they will be accepted, we should see these changes In next code drops from > ND. These base drivers are used in several places. We are allowed to submit a patch in Linux or FreeBSD but not in DPDK where the base driver is verbatim? We have an agreement to not touch them in DPDK but I still think the ND team could consider some patches from dpdk.org.
[dpdk-dev] Errors Rx count increasing while pktgen doing nothing on Intel 82598EB 10G
Let me share one more information about this issue. I tried to run dpdk-2.2.0/examples/ethtool, which showed the same problem. (i.e. I guess other examples will also fail.) [root at centos7 app]# ./ethtool EAL: Detected lcore 0 as core 0 on socket 0 EAL: Detected lcore 1 as core 0 on socket 1 EAL: Detected lcore 2 as core 1 on socket 0 EAL: Detected lcore 3 as core 1 on socket 1 EAL: Detected lcore 4 as core 2 on socket 0 EAL: Detected lcore 5 as core 2 on socket 1 EAL: Detected lcore 6 as core 3 on socket 0 EAL: Detected lcore 7 as core 3 on socket 1 EAL: Detected lcore 8 as core 0 on socket 0 EAL: Detected lcore 9 as core 0 on socket 1 EAL: Detected lcore 10 as core 1 on socket 0 EAL: Detected lcore 11 as core 1 on socket 1 EAL: Detected lcore 12 as core 2 on socket 0 EAL: Detected lcore 13 as core 2 on socket 1 EAL: Detected lcore 14 as core 3 on socket 0 EAL: Detected lcore 15 as core 3 on socket 1 EAL: Support maximum 128 logical core(s) by configuration. EAL: Detected 16 lcore(s) EAL: VFIO modules not all loaded, skip VFIO support... EAL: Setting up physically contiguous memory... EAL: Ask a virtual area of 0x140 bytes EAL: Virtual area found at 0x7f79fa80 (size = 0x140) EAL: Ask a virtual area of 0x7ec0 bytes EAL: Virtual area found at 0x7f797ba0 (size = 0x7ec0) EAL: Ask a virtual area of 0x20 bytes EAL: Virtual area found at 0x7f797b60 (size = 0x20) EAL: Ask a virtual area of 0x7fc0 bytes EAL: Virtual area found at 0x7f78fb80 (size = 0x7fc0) EAL: Ask a virtual area of 0x20 bytes EAL: Virtual area found at 0x7f78fb40 (size = 0x20) EAL: Ask a virtual area of 0x1 bytes EAL: Virtual area found at 0x7f77fb20 (size = 0x1) EAL: Requesting 2048 pages of size 2MB from socket 0 EAL: Requesting 2048 pages of size 2MB from socket 1 EAL: TSC frequency is ~2260984 KHz EAL: Master lcore 0 is ready (tid=fdc578c0;cpuset=[0]) EAL: lcore 6 is ready (tid=f81cb700;cpuset=[6]) EAL: lcore 4 is ready (tid=f91cd700;cpuset=[4]) EAL: lcore 12 is ready (tid=f51c5700;cpuset=[12]) EAL: lcore 9 is ready (tid=f69c8700;cpuset=[9]) EAL: lcore 8 is ready (tid=f71c9700;cpuset=[8]) EAL: lcore 14 is ready (tid=f41c3700;cpuset=[14]) EAL: lcore 3 is ready (tid=f99ce700;cpuset=[3]) EAL: lcore 13 is ready (tid=f49c4700;cpuset=[13]) EAL: lcore 7 is ready (tid=f79ca700;cpuset=[7]) EAL: lcore 10 is ready (tid=f61c7700;cpuset=[10]) EAL: lcore 11 is ready (tid=f59c6700;cpuset=[11]) EAL: lcore 15 is ready (tid=f39c2700;cpuset=[15]) EAL: lcore 5 is ready (tid=f89cc700;cpuset=[5]) EAL: lcore 2 is ready (tid=fa1cf700;cpuset=[2]) EAL: lcore 1 is ready (tid=fc562700;cpuset=[1]) EAL: PCI device :06:00.0 on NUMA socket -1 EAL: probe driver: 8086:10e8 rte_igb_pmd EAL: Not managed by a supported kernel driver, skipped EAL: PCI device :06:00.1 on NUMA socket -1 EAL: probe driver: 8086:10e8 rte_igb_pmd EAL: Not managed by a supported kernel driver, skipped EAL: PCI device :07:00.0 on NUMA socket -1 EAL: probe driver: 8086:10e8 rte_igb_pmd EAL: Not managed by a supported kernel driver, skipped EAL: PCI device :07:00.1 on NUMA socket -1 EAL: probe driver: 8086:10e8 rte_igb_pmd EAL: Not managed by a supported kernel driver, skipped EAL: PCI device :09:00.0 on NUMA socket -1 EAL: probe driver: 8086:10c7 rte_ixgbe_pmd EAL: PCI memory mapped at 0x7f79fbc0 EAL: PCI memory mapped at 0x7f79fbc2 EAL: PCI memory mapped at 0x7f79fbc6 PMD: eth_ixgbe_dev_init(): MAC: 1, PHY: 8 PMD: eth_ixgbe_dev_init(): port 0 vendorID=0x8086 deviceID=0x10c7 EAL: PCI device :0a:00.0 on NUMA socket -1 EAL: probe driver: 8086:10c7 rte_ixgbe_pmd EAL: PCI memory mapped at 
0x7f79fbc64000 EAL: PCI memory mapped at 0x7f79fbc84000 EAL: PCI memory mapped at 0x7f79fbcc4000 PMD: eth_ixgbe_dev_init(): MAC: 1, PHY: 8 PMD: eth_ixgbe_dev_init(): port 1 vendorID=0x8086 deviceID=0x10c7 Number of NICs: 2 Init port 0.. PMD: ixgbe_dev_rx_queue_setup(): sw_ring=0x7f78f99d7140 sw_sc_ring=0x7f78f99d6c00 hw_ring=0x7f78f99d7680 dma_addr=0x4207d7680 PMD: ixgbe_dev_tx_queue_setup(): sw_ring=0x7f78f99c5a40 hw_ring=0x7f78f99c6a80 dma_addr=0x4207c6a80 PMD: ixgbe_set_tx_function(): Using simple tx code path PMD: ixgbe_set_tx_function(): Vector tx enabled. PMD: ixgbe_set_rx_function(): Vector rx enabled, please make sure RX burst size no less than 4 (port=0). PMD: ixgbe_dev_rx_queue_start(): Could not enable Rx Queue 0 Init port 1.. PMD: ixgbe_dev_rx_queue_setup(): sw_ring=0x7f78f846ad40 sw_sc_ring=0x7f78f846a800 hw_ring=0x7f78f846b280 dma_addr=0x41f26b280 PMD: ixgbe_dev_tx_queue_setup(): sw_ring=0x7f78f8459640 hw_ring=0x7f78f845a680 dma_addr=0x41f25a680 PMD: ixgbe_set_tx_function(): Using simple tx code path PMD: ixgbe_set_tx_function(): Vector tx enabled. PMD: ixgbe_set_rx_function(): Vector rx enabled, please make sure RX burst size no less than 4 (port=1). PMD: ixgbe_dev_rx_queue_start(): Could not enable Rx Queue 0 EthApp> pause [UINT16]: pause Print port pau
[dpdk-dev] [PATCH] pci: Add the class_id support in pci probe
On 01/29/2016 11:34 AM, Thomas Monjalon wrote: > 2016-01-29 11:21, Panu Matilainen: >> On 01/28/2016 11:38 PM, Thomas Monjalon wrote: >>> 2016-01-13 14:22, Panu Matilainen: On 01/13/2016 01:55 PM, Bruce Richardson wrote: > On Thu, Dec 31, 2015 at 09:12:14AM -0800, Stephen Hemminger wrote: >> On Tue, 29 Dec 2015 10:53:26 +0800 >> Ziye Yang wrote: >> >>> This patch is used to add the class_id support >>> for pci_probe since some devices need the class_info >>> (class_code, subclass_code, programming_interface) >>> >>> Signed-off-by: Ziye Yang >> >> Since rte_pci is exposed to application this breaks the ABI. > > But applications are not going to be defining rte_pci_ids values > internally, are > they? That is for drivers to use. Is this really an ABI breakage for > applications that we > need to be concerned about? There might not be applications using it but drivers are ABI consumers too - think of 3rd party drivers and such. >>> >>> Drivers are not ABI consumers in the sense that ABI means >>> Application Binary Interface. >>> We are talking about drivers interface here. >>> When establishing the ABI policy we were discussing about applications only. >> >> Generally speaking an ABI is an interface between two program (or >> software if you like) modules, its not specific to "applications". >> Looking at http://dpdk.org/doc/guides/contributing/versioning.html I see >> it does only talk about applications, but an ABI consumer can also be >> another library. A driver calling rte_malloc() is just as much >> librte_eal ABI consumer as anything else. >> >> Now, I understand that drivers use and need interface(s) that >> applications have no use for or simply cannot use, and those interfaces >> could be subject to different policies. As an extreme example, the Linux >> kernel has two isolated ABIs, one is the userland system call interface >> which is guaranteed to stay forever and the other is kernel module >> interface, guarantees nothing at all. >> >> In DPDK the difference is far muddier than that since all the interfaces >> live in common, versioned userland DSOs. So if there are two different >> interfaces following two different policies, it's all the more important >> to clearly document them. One simple way could be using a different >> prefix than rte_. > > Good suggestion. Or we can simply have different files with a clear notice > in their headers and in the versioning doc. > It was well split in rte_cryptodev_pmd.h Using separate headers is also good. Optimally both? :) >>> I agree we must allow 3rd party drivers but there is no good reason >>> to try to upgrade DPDK without upgrading/porting the external drivers. >>> If someone does not want to release its driver and keep upgrading DPDK, >>> it is acceptable IMHO to force an upgrade of its driver. >> >> Note that I've no particular sympathy for 3rd party drivers as such. >> What I *do* care about is that breakage is made explicit, as in drivers >> built for an incompatible version refuse to load at all, instead of >> silently corrupting memory etc. > > OK I agree. Cool, the rest is just details then. > Anyway the ABI versionning does not cover the structure changes. > What about making a DPDK version check when registering a driver? > So a binary driver would be clearly bound to a DPDK version. That's one possibility. Another way to achieve essentially the same is to make rte_eal_driver_register() symbol version follow the DPDK version, in which case a driver built for another version will fail at dlopen() already. 
> And we should change or remove the .so version which never change for > most of drivers. Yup, so-versioning DSOs which do not offer any ABI is kinda pointless. - Panu -
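A hedged sketch of what such an explicit check could look like — the wrapper below is hypothetical, not an existing EAL API; it simply refuses to register a driver whose recorded build-time version does not match the running DPDK:

#include <rte_version.h>
#include <rte_debug.h>
#include <rte_dev.h>

/* Hypothetical: drv_built_against would be RTE_VERSION captured when the
 * (possibly out-of-tree) driver was compiled. */
static void
register_driver_checked(struct rte_driver *drv, unsigned int drv_built_against)
{
	if (drv_built_against != (unsigned int)RTE_VERSION)
		rte_panic("driver %s built for another DPDK version\n",
			  drv->name);
	rte_eal_driver_register(drv);
}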
[dpdk-dev] [PATCH] doc: minor correction in document
* remove outdated chapter reference to Multi-process support Fixes: fc1f2750a3ec ("doc: programmers guide") * html output converts "--" to "-", this is wrong when explaining the command arguments, used "option list" to fix this: http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#option-lists Signed-off-by: Ferruh Yigit --- doc/guides/prog_guide/env_abstraction_layer.rst | 2 +- doc/guides/prog_guide/multi_proc_support.rst| 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/doc/guides/prog_guide/env_abstraction_layer.rst b/doc/guides/prog_guide/env_abstraction_layer.rst index 89feb69..4737dc2 100644 --- a/doc/guides/prog_guide/env_abstraction_layer.rst +++ b/doc/guides/prog_guide/env_abstraction_layer.rst @@ -103,7 +103,7 @@ Multi-process Support ~ The Linuxapp EAL allows a multi-process as well as a multi-threaded (pthread) deployment model. -See chapter 2.20 +See chapter :ref:`Multi-process Support ` for more details. Memory Mapping Discovery and Memory Reservation diff --git a/doc/guides/prog_guide/multi_proc_support.rst b/doc/guides/prog_guide/multi_proc_support.rst index 6562f0d..1478a13 100644 --- a/doc/guides/prog_guide/multi_proc_support.rst +++ b/doc/guides/prog_guide/multi_proc_support.rst @@ -55,9 +55,8 @@ after a primary process has already configured the hugepage shared memory for th To support these two process types, and other multi-process setups described later, two additional command-line parameters are available to the EAL: -* --proc-type: for specifying a given process instance as the primary or secondary DPDK instance - -* --file-prefix: to allow processes that do not want to co-operate to have different memory regions + --proc-typefor specifying a given process instance as the primary or secondary DPDK instance + --file-prefix to allow processes that do not want to co-operate to have different memory regions A number of example applications are provided that demonstrate how multiple DPDK processes can be used together. These are more fully documented in the "Multi- process Sample Application" chapter -- 2.5.0
[dpdk-dev] [PATCH v7 3/5] ethdev: redesign link speed config API
> -Original Message- > From: Thomas Monjalon [mailto:thomas.monjalon at 6wind.com] > Sent: Friday, January 29, 2016 9:54 AM > To: Ananyev, Konstantin > Cc: dev at dpdk.org; Marc Sune; Lu, Wenzhuo; Zhang, Helin; Harish Patil; > Chen, Jing D; Mcnamara, John > Subject: Re: [dpdk-dev] [PATCH v7 3/5] ethdev: redesign link speed config API > > 2016-01-29 09:47, Ananyev, Konstantin: > > From: Thomas Monjalon [mailto:thomas.monjalon at 6wind.com] > > > 2016-01-29 09:24, Ananyev, Konstantin: > > > > Can you avoid modifications in the e1000/base code? > > > > We do not modify (and maintain) that part on our own. > > > > Instead we take it straight from Intel ND. > > > > So if you feel like these changes are really necessary - please submit > > > > a patch > > > > to ND first, and if your changes will be applied, will pick it up from > > > > them. > > > > > > I was not aware we can submit a change to ND for Intel base drivers. > > > What is the procedure please? > > > > I meant not to the ND directly, but probably to the freebsd e1000 kernel > > driver. > > As I remember, that is the closest one to what we have. > > From my understanding (I might be wrong here): > > If they will be accepted, we should see these changes In next code drops > > from ND. > > These base drivers are used in several places. > We are allowed to submit a patch in Linux or FreeBSD but not in DPDK > where the base driver is verbatim? Yes, that's my understanding. > We have an agreement to not touch them in DPDK Yes. > but I still think the > ND team could consider some patches from dpdk.org. I personally think that would be a good thing, but it is up to ND guys to make such decision. Konstantin
[dpdk-dev] [PATCH] mlx5: fix header generation in parallel builds
Fixes: 771fa900b73a ("mlx5: introduce new driver for Mellanox ConnectX-4 adapters") Signed-off-by: Adrien Mazarguil --- drivers/net/mlx5/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile index ae568e6..698f072 100644 --- a/drivers/net/mlx5/Makefile +++ b/drivers/net/mlx5/Makefile @@ -126,7 +126,7 @@ mlx5_autoconf.h: $(RTE_SDK)/scripts/auto-config-h.sh enum IBV_EXP_QP_BURST_CREATE_ENABLE_MULTI_PACKET_SEND_WR \ $(AUTOCONF_OUTPUT) -mlx5.o: mlx5_autoconf.h +$(SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD):.c=.o): mlx5_autoconf.h clean_mlx5: FORCE $Q rm -f -- mlx5_autoconf.h -- 2.1.4
[dpdk-dev] [PATCH 0/5] Add flow director and RX VLAN stripping support
To preserve compatibility with Mellanox OFED 3.1, flow director and RX VLAN stripping code is only enabled if compiled with 3.2. Yaacov Hazan (5): mlx5: refactor special flows handling mlx5: add special flows (broadcast and IPv6 multicast) mlx5: make flow steering rule generator more generic mlx5: add support for flow director mlx5: add support for RX VLAN stripping drivers/net/mlx5/Makefile | 6 + drivers/net/mlx5/mlx5.c | 39 +- drivers/net/mlx5/mlx5.h | 19 +- drivers/net/mlx5/mlx5_defs.h| 14 + drivers/net/mlx5/mlx5_ethdev.c | 3 +- drivers/net/mlx5/mlx5_fdir.c| 890 drivers/net/mlx5/mlx5_mac.c | 10 +- drivers/net/mlx5/mlx5_rxmode.c | 350 drivers/net/mlx5/mlx5_rxq.c | 80 +++- drivers/net/mlx5/mlx5_rxtx.c| 27 ++ drivers/net/mlx5/mlx5_rxtx.h| 51 ++- drivers/net/mlx5/mlx5_trigger.c | 21 +- drivers/net/mlx5/mlx5_vlan.c| 104 + 13 files changed, 1388 insertions(+), 226 deletions(-) create mode 100644 drivers/net/mlx5/mlx5_fdir.c -- 2.1.4
[dpdk-dev] [PATCH 1/5] mlx5: refactor special flows handling
From: Yaacov Hazan Merge redundant code by adding a static initialization table to manage promiscuous and allmulticast (special) flows. New function priv_rehash_flows() implements the logic to enable/disable relevant flows in one place from any context. Signed-off-by: Yaacov Hazan Signed-off-by: Adrien Mazarguil --- drivers/net/mlx5/mlx5.c | 4 +- drivers/net/mlx5/mlx5.h | 6 +- drivers/net/mlx5/mlx5_defs.h| 3 + drivers/net/mlx5/mlx5_rxmode.c | 321 ++-- drivers/net/mlx5/mlx5_rxq.c | 33 - drivers/net/mlx5/mlx5_rxtx.h| 29 +++- drivers/net/mlx5/mlx5_trigger.c | 14 +- 7 files changed, 210 insertions(+), 200 deletions(-) diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index 821ee0f..4180842 100644 --- a/drivers/net/mlx5/mlx5.c +++ b/drivers/net/mlx5/mlx5.c @@ -88,8 +88,8 @@ mlx5_dev_close(struct rte_eth_dev *dev) ((priv->ctx != NULL) ? priv->ctx->device->name : "")); /* In case mlx5_dev_stop() has not been called. */ priv_dev_interrupt_handler_uninstall(priv, dev); - priv_allmulticast_disable(priv); - priv_promiscuous_disable(priv); + priv_special_flow_disable(priv, HASH_RXQ_FLOW_TYPE_ALLMULTI); + priv_special_flow_disable(priv, HASH_RXQ_FLOW_TYPE_PROMISC); priv_mac_addrs_disable(priv); priv_destroy_hash_rxqs(priv); /* Prevent crashes when queues are still in use. */ diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index b84d31d..bc0c7e2 100644 --- a/drivers/net/mlx5/mlx5.h +++ b/drivers/net/mlx5/mlx5.h @@ -194,13 +194,11 @@ int mlx5_dev_rss_reta_update(struct rte_eth_dev *, /* mlx5_rxmode.c */ -int priv_promiscuous_enable(struct priv *); +int priv_special_flow_enable(struct priv *, enum hash_rxq_flow_type); +void priv_special_flow_disable(struct priv *, enum hash_rxq_flow_type); void mlx5_promiscuous_enable(struct rte_eth_dev *); -void priv_promiscuous_disable(struct priv *); void mlx5_promiscuous_disable(struct rte_eth_dev *); -int priv_allmulticast_enable(struct priv *); void mlx5_allmulticast_enable(struct rte_eth_dev *); -void priv_allmulticast_disable(struct priv *); void mlx5_allmulticast_disable(struct rte_eth_dev *); /* mlx5_stats.c */ diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h index ae5eda9..1f2a010 100644 --- a/drivers/net/mlx5/mlx5_defs.h +++ b/drivers/net/mlx5/mlx5_defs.h @@ -43,6 +43,9 @@ /* Maximum number of simultaneous VLAN filters. */ #define MLX5_MAX_VLAN_IDS 128 +/* Maximum number of special flows. */ +#define MLX5_MAX_SPECIAL_FLOWS 2 + /* Request send completion once in every 64 sends, might be less. */ #define MLX5_PMD_TX_PER_COMP_REQ 64 diff --git a/drivers/net/mlx5/mlx5_rxmode.c b/drivers/net/mlx5/mlx5_rxmode.c index 096fd18..b2ed17e 100644 --- a/drivers/net/mlx5/mlx5_rxmode.c +++ b/drivers/net/mlx5/mlx5_rxmode.c @@ -58,31 +58,96 @@ #include "mlx5_rxtx.h" #include "mlx5_utils.h" -static void hash_rxq_promiscuous_disable(struct hash_rxq *); -static void hash_rxq_allmulticast_disable(struct hash_rxq *); +/* Initialization data for special flows. 
*/ +static const struct special_flow_init special_flow_init[] = { + [HASH_RXQ_FLOW_TYPE_PROMISC] = { + .dst_mac_val = "\x00\x00\x00\x00\x00\x00", + .dst_mac_mask = "\x00\x00\x00\x00\x00\x00", + .hash_types = + 1 << HASH_RXQ_TCPV4 | + 1 << HASH_RXQ_UDPV4 | + 1 << HASH_RXQ_IPV4 | +#ifdef HAVE_FLOW_SPEC_IPV6 + 1 << HASH_RXQ_TCPV6 | + 1 << HASH_RXQ_UDPV6 | + 1 << HASH_RXQ_IPV6 | +#endif /* HAVE_FLOW_SPEC_IPV6 */ + 1 << HASH_RXQ_ETH | + 0, + }, + [HASH_RXQ_FLOW_TYPE_ALLMULTI] = { + .dst_mac_val = "\x01\x00\x00\x00\x00\x00", + .dst_mac_mask = "\x01\x00\x00\x00\x00\x00", + .hash_types = + 1 << HASH_RXQ_UDPV4 | + 1 << HASH_RXQ_IPV4 | +#ifdef HAVE_FLOW_SPEC_IPV6 + 1 << HASH_RXQ_UDPV6 | + 1 << HASH_RXQ_IPV6 | +#endif /* HAVE_FLOW_SPEC_IPV6 */ + 1 << HASH_RXQ_ETH | + 0, + }, +}; /** - * Enable promiscuous mode in a hash RX queue. + * Enable a special flow in a hash RX queue. * * @param hash_rxq * Pointer to hash RX queue structure. + * @param flow_type + * Special flow type. * * @return * 0 on success, errno value on failure. */ static int -hash_rxq_promiscuous_enable(struct hash_rxq *hash_rxq) +hash_rxq_special_flow_enable(struct hash_rxq *hash_rxq, +enum hash_rxq_flow_type flow_type) { struct ibv_exp_flow *flow; FLOW_ATTR_SPEC_ETH(data, hash_rxq_flow_attr(hash_rxq, NULL, 0)); struct ibv_exp_flow_attr *attr = &data->attr; + stru
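A condensed, hedged sketch of the logic priv_rehash_flows() centralizes — walking the special flow types and enabling or disabling each one according to the current port state; priv_flow_is_requested() is a hypothetical predicate standing in for the patch's per-flow bookkeeping:

/* Simplified illustration only; the real priv_rehash_flows() in the patch
 * performs this from any context using the table-driven helpers above. */
static int
rehash_special_flows(struct priv *priv)
{
	unsigned int i;
	int ret;

	for (i = 0; i != (HASH_RXQ_FLOW_TYPE_ALLMULTI + 1); ++i) {
		if (!priv_flow_is_requested(priv, i)) {
			priv_special_flow_disable(priv, i);
			continue;
		}
		ret = priv_special_flow_enable(priv, i);
		if (ret)
			return ret;
	}
	return 0;
}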
[dpdk-dev] [PATCH 2/5] mlx5: add special flows (broadcast and IPv6 multicast)
From: Yaacov Hazan Until now, broadcast frames were handled like unicast. Moving the related flow to the special flows table frees up the related unicast MAC entry. The same method is used to handle IPv6 multicast frames. Signed-off-by: Yaacov Hazan Signed-off-by: Adrien Mazarguil --- drivers/net/mlx5/mlx5.c | 7 +++ drivers/net/mlx5/mlx5_defs.h| 2 +- drivers/net/mlx5/mlx5_ethdev.c | 3 +-- drivers/net/mlx5/mlx5_mac.c | 6 ++ drivers/net/mlx5/mlx5_rxmode.c | 24 drivers/net/mlx5/mlx5_rxq.c | 10 ++ drivers/net/mlx5/mlx5_rxtx.h| 6 ++ drivers/net/mlx5/mlx5_trigger.c | 4 8 files changed, 51 insertions(+), 11 deletions(-) diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index 4180842..6afc873 100644 --- a/drivers/net/mlx5/mlx5.c +++ b/drivers/net/mlx5/mlx5.c @@ -90,6 +90,8 @@ mlx5_dev_close(struct rte_eth_dev *dev) priv_dev_interrupt_handler_uninstall(priv, dev); priv_special_flow_disable(priv, HASH_RXQ_FLOW_TYPE_ALLMULTI); priv_special_flow_disable(priv, HASH_RXQ_FLOW_TYPE_PROMISC); + priv_special_flow_disable(priv, HASH_RXQ_FLOW_TYPE_BROADCAST); + priv_special_flow_disable(priv, HASH_RXQ_FLOW_TYPE_IPV6MULTI); priv_mac_addrs_disable(priv); priv_destroy_hash_rxqs(priv); /* Prevent crashes when queues are still in use. */ @@ -415,13 +417,10 @@ mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) mac.addr_bytes[0], mac.addr_bytes[1], mac.addr_bytes[2], mac.addr_bytes[3], mac.addr_bytes[4], mac.addr_bytes[5]); - /* Register MAC and broadcast addresses. */ + /* Register MAC address. */ claim_zero(priv_mac_addr_add(priv, 0, (const uint8_t (*)[ETHER_ADDR_LEN]) mac.addr_bytes)); - claim_zero(priv_mac_addr_add(priv, (RTE_DIM(priv->mac) - 1), -&(const uint8_t [ETHER_ADDR_LEN]) -{ "\xff\xff\xff\xff\xff\xff" })); #ifndef NDEBUG { char ifname[IF_NAMESIZE]; diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h index 1f2a010..a7440e7 100644 --- a/drivers/net/mlx5/mlx5_defs.h +++ b/drivers/net/mlx5/mlx5_defs.h @@ -44,7 +44,7 @@ #define MLX5_MAX_VLAN_IDS 128 /* Maximum number of special flows. */ -#define MLX5_MAX_SPECIAL_FLOWS 2 +#define MLX5_MAX_SPECIAL_FLOWS 4 /* Request send completion once in every 64 sends, might be less. */ #define MLX5_PMD_TX_PER_COMP_REQ 64 diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c index 1159fa3..6704382 100644 --- a/drivers/net/mlx5/mlx5_ethdev.c +++ b/drivers/net/mlx5/mlx5_ethdev.c @@ -501,8 +501,7 @@ mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info) max = 65535; info->max_rx_queues = max; info->max_tx_queues = max; - /* Last array entry is reserved for broadcast. */ - info->max_mac_addrs = (RTE_DIM(priv->mac) - 1); + info->max_mac_addrs = RTE_DIM(priv->mac); info->rx_offload_capa = (priv->hw_csum ? (DEV_RX_OFFLOAD_IPV4_CKSUM | diff --git a/drivers/net/mlx5/mlx5_mac.c b/drivers/net/mlx5/mlx5_mac.c index e37ce06..1a0b974 100644 --- a/drivers/net/mlx5/mlx5_mac.c +++ b/drivers/net/mlx5/mlx5_mac.c @@ -212,8 +212,7 @@ mlx5_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index) priv_lock(priv); DEBUG("%p: removing MAC address from index %" PRIu32, (void *)dev, index); - /* Last array entry is reserved for broadcast. */ - if (index >= (RTE_DIM(priv->mac) - 1)) + if (index >= RTE_DIM(priv->mac)) goto end; priv_mac_addr_del(priv, index); end: @@ -479,8 +478,7 @@ mlx5_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac_addr, priv_lock(priv); DEBUG("%p: adding MAC address at index %" PRIu32, (void *)dev, index); - /* Last array entry is reserved for broadcast. 
*/ - if (index >= (RTE_DIM(priv->mac) - 1)) + if (index >= RTE_DIM(priv->mac)) goto end; priv_mac_addr_add(priv, index, (const uint8_t (*)[ETHER_ADDR_LEN]) diff --git a/drivers/net/mlx5/mlx5_rxmode.c b/drivers/net/mlx5/mlx5_rxmode.c index b2ed17e..6ee7ce3 100644 --- a/drivers/net/mlx5/mlx5_rxmode.c +++ b/drivers/net/mlx5/mlx5_rxmode.c @@ -88,6 +88,30 @@ static const struct special_flow_init special_flow_init[] = { 1 << HASH_RXQ_ETH | 0, }, + [HASH_RXQ_FLOW_TYPE_BROADCAST] = { + .dst_mac_val = "\xff\xff\xff\xff\xff\xff", + .dst_mac_mask = "\xff\xff\xff\xff\xff\xff", + .has
[dpdk-dev] [PATCH 3/5] mlx5: make flow steering rule generator more generic
From: Yaacov Hazan Upcoming flow director support will reuse this function to generate filter rules. Signed-off-by: Yaacov Hazan Signed-off-by: Adrien Mazarguil --- drivers/net/mlx5/mlx5_mac.c| 4 ++-- drivers/net/mlx5/mlx5_rxmode.c | 5 +++-- drivers/net/mlx5/mlx5_rxq.c| 16 drivers/net/mlx5/mlx5_rxtx.h | 4 ++-- 4 files changed, 15 insertions(+), 14 deletions(-) diff --git a/drivers/net/mlx5/mlx5_mac.c b/drivers/net/mlx5/mlx5_mac.c index 1a0b974..f952afc 100644 --- a/drivers/net/mlx5/mlx5_mac.c +++ b/drivers/net/mlx5/mlx5_mac.c @@ -241,7 +241,7 @@ hash_rxq_add_mac_flow(struct hash_rxq *hash_rxq, unsigned int mac_index, const uint8_t (*mac)[ETHER_ADDR_LEN] = (const uint8_t (*)[ETHER_ADDR_LEN]) priv->mac[mac_index].addr_bytes; - FLOW_ATTR_SPEC_ETH(data, hash_rxq_flow_attr(hash_rxq, NULL, 0)); + FLOW_ATTR_SPEC_ETH(data, priv_flow_attr(priv, NULL, 0, hash_rxq->type)); struct ibv_exp_flow_attr *attr = &data->attr; struct ibv_exp_flow_spec_eth *spec = &data->spec; unsigned int vlan_enabled = !!priv->vlan_filter_n; @@ -256,7 +256,7 @@ hash_rxq_add_mac_flow(struct hash_rxq *hash_rxq, unsigned int mac_index, * This layout is expected by libibverbs. */ assert(((uint8_t *)attr + sizeof(*attr)) == (uint8_t *)spec); - hash_rxq_flow_attr(hash_rxq, attr, sizeof(data)); + priv_flow_attr(priv, attr, sizeof(data), hash_rxq->type); /* The first specification must be Ethernet. */ assert(spec->type == IBV_EXP_FLOW_SPEC_ETH); assert(spec->size == sizeof(*spec)); diff --git a/drivers/net/mlx5/mlx5_rxmode.c b/drivers/net/mlx5/mlx5_rxmode.c index 6ee7ce3..9ac7a41 100644 --- a/drivers/net/mlx5/mlx5_rxmode.c +++ b/drivers/net/mlx5/mlx5_rxmode.c @@ -129,8 +129,9 @@ static int hash_rxq_special_flow_enable(struct hash_rxq *hash_rxq, enum hash_rxq_flow_type flow_type) { + struct priv *priv = hash_rxq->priv; struct ibv_exp_flow *flow; - FLOW_ATTR_SPEC_ETH(data, hash_rxq_flow_attr(hash_rxq, NULL, 0)); + FLOW_ATTR_SPEC_ETH(data, priv_flow_attr(priv, NULL, 0, hash_rxq->type)); struct ibv_exp_flow_attr *attr = &data->attr; struct ibv_exp_flow_spec_eth *spec = &data->spec; const uint8_t *mac; @@ -148,7 +149,7 @@ hash_rxq_special_flow_enable(struct hash_rxq *hash_rxq, * This layout is expected by libibverbs. */ assert(((uint8_t *)attr + sizeof(*attr)) == (uint8_t *)spec); - hash_rxq_flow_attr(hash_rxq, attr, sizeof(data)); + priv_flow_attr(priv, attr, sizeof(data), hash_rxq->type); /* The first specification must be Ethernet. */ assert(spec->type == IBV_EXP_FLOW_SPEC_ETH); assert(spec->size == sizeof(*spec)); diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c index 27e3bcc..641ee07 100644 --- a/drivers/net/mlx5/mlx5_rxq.c +++ b/drivers/net/mlx5/mlx5_rxq.c @@ -210,27 +210,27 @@ const size_t rss_hash_default_key_len = sizeof(rss_hash_default_key); * information from hash_rxq_init[]. Nothing is written to flow_attr when * flow_attr_size is not large enough, but the required size is still returned. * - * @param[in] hash_rxq - * Pointer to hash RX queue. + * @param priv + * Pointer to private structure. * @param[out] flow_attr * Pointer to flow attribute structure to fill. Note that the allocated * area must be larger and large enough to hold all flow specifications. * @param flow_attr_size * Entire size of flow_attr and trailing room for flow specifications. + * @param type + * Hash RX queue type to use for flow steering rule. * * @return * Total size of the flow attribute buffer. No errors are defined. 
*/ size_t -hash_rxq_flow_attr(const struct hash_rxq *hash_rxq, - struct ibv_exp_flow_attr *flow_attr, - size_t flow_attr_size) +priv_flow_attr(struct priv *priv, struct ibv_exp_flow_attr *flow_attr, + size_t flow_attr_size, enum hash_rxq_type type) { size_t offset = sizeof(*flow_attr); - enum hash_rxq_type type = hash_rxq->type; const struct hash_rxq_init *init = &hash_rxq_init[type]; - assert(hash_rxq->priv != NULL); + assert(priv != NULL); assert((size_t)type < RTE_DIM(hash_rxq_init)); do { offset += init->flow_spec.hdr.size; @@ -244,7 +244,7 @@ hash_rxq_flow_attr(const struct hash_rxq *hash_rxq, .type = IBV_EXP_FLOW_ATTR_NORMAL, .priority = init->flow_priority, .num_of_specs = 0, - .port = hash_rxq->priv->port, + .port = priv->port, .flags = 0, }; do { diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h index d5a5019..c42bb8d 100644 --- a/drivers/net/mlx5/mlx5_rxtx.h +++ b/drivers/net/mlx5/mlx5_rxtx.h @@ -270,8 +270
[dpdk-dev] [PATCH 4/5] mlx5: add support for flow director
From: Yaacov Hazan Add support for flow director filters (RTE_FDIR_MODE_PERFECT and RTE_FDIR_MODE_PERFECT_MAC_VLAN modes). This feature requires MLNX_OFED 3.2. Signed-off-by: Yaacov Hazan Signed-off-by: Adrien Mazarguil --- drivers/net/mlx5/Makefile | 6 + drivers/net/mlx5/mlx5.c | 12 + drivers/net/mlx5/mlx5.h | 10 + drivers/net/mlx5/mlx5_defs.h| 11 + drivers/net/mlx5/mlx5_fdir.c| 890 drivers/net/mlx5/mlx5_rxq.c | 6 + drivers/net/mlx5/mlx5_rxtx.h| 7 + drivers/net/mlx5/mlx5_trigger.c | 3 + 8 files changed, 945 insertions(+) create mode 100644 drivers/net/mlx5/mlx5_fdir.c diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile index 698f072..46a17e0 100644 --- a/drivers/net/mlx5/Makefile +++ b/drivers/net/mlx5/Makefile @@ -52,6 +52,7 @@ SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rxmode.c SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_vlan.c SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_stats.c SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rss.c +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_fdir.c # Dependencies. DEPDIRS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += lib/librte_ether @@ -125,6 +126,11 @@ mlx5_autoconf.h: $(RTE_SDK)/scripts/auto-config-h.sh infiniband/verbs.h \ enum IBV_EXP_QP_BURST_CREATE_ENABLE_MULTI_PACKET_SEND_WR \ $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS \ + infiniband/verbs.h \ + enum IBV_EXP_DEVICE_ATTR_VLAN_OFFLOADS \ + $(AUTOCONF_OUTPUT) $(SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD):.c=.o): mlx5_autoconf.h diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index 6afc873..2646b3f 100644 --- a/drivers/net/mlx5/mlx5.c +++ b/drivers/net/mlx5/mlx5.c @@ -94,6 +94,11 @@ mlx5_dev_close(struct rte_eth_dev *dev) priv_special_flow_disable(priv, HASH_RXQ_FLOW_TYPE_IPV6MULTI); priv_mac_addrs_disable(priv); priv_destroy_hash_rxqs(priv); + + /* Remove flow director elements. */ + priv_fdir_disable(priv); + priv_fdir_delete_filters_list(priv); + /* Prevent crashes when queues are still in use. */ dev->rx_pkt_burst = removed_rx_burst; dev->tx_pkt_burst = removed_tx_burst; @@ -169,6 +174,9 @@ static const struct eth_dev_ops mlx5_dev_ops = { .reta_query = mlx5_dev_rss_reta_query, .rss_hash_update = mlx5_rss_hash_update, .rss_hash_conf_get = mlx5_rss_hash_conf_get, +#ifdef MLX5_FDIR_SUPPORT + .filter_ctrl = mlx5_dev_filter_ctrl, +#endif /* MLX5_FDIR_SUPPORT */ }; static struct { @@ -421,6 +429,10 @@ mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) claim_zero(priv_mac_addr_add(priv, 0, (const uint8_t (*)[ETHER_ADDR_LEN]) mac.addr_bytes)); + /* Initialize FD filters list. */ + err = fdir_init_filters_list(priv); + if (err) + goto port_error; #ifndef NDEBUG { char ifname[IF_NAMESIZE]; diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index bc0c7e2..fdd2504 100644 --- a/drivers/net/mlx5/mlx5.h +++ b/drivers/net/mlx5/mlx5.h @@ -120,6 +120,7 @@ struct priv { struct rte_intr_handle intr_handle; /* Interrupt handler. */ unsigned int (*reta_idx)[]; /* RETA index table. */ unsigned int reta_idx_n; /* RETA index size. */ + struct fdir_filter_list *fdir_filter_list; /* Flow director rules. */ rte_spinlock_t lock; /* Lock for control functions. 
*/ }; @@ -215,4 +216,13 @@ int mlx5_vlan_filter_set(struct rte_eth_dev *, uint16_t, int); int mlx5_dev_start(struct rte_eth_dev *); void mlx5_dev_stop(struct rte_eth_dev *); +/* mlx5_fdir.c */ + +int fdir_init_filters_list(struct priv *); +void priv_fdir_delete_filters_list(struct priv *); +void priv_fdir_disable(struct priv *); +void priv_fdir_enable(struct priv *); +int mlx5_dev_filter_ctrl(struct rte_eth_dev *, enum rte_filter_type, +enum rte_filter_op, void *); + #endif /* RTE_PMD_MLX5_H_ */ diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h index a7440e7..195440c 100644 --- a/drivers/net/mlx5/mlx5_defs.h +++ b/drivers/net/mlx5/mlx5_defs.h @@ -34,6 +34,8 @@ #ifndef RTE_PMD_MLX5_DEFS_H_ #define RTE_PMD_MLX5_DEFS_H_ +#include "mlx5_autoconf.h" + /* Reported driver name. */ #define MLX5_DRIVER_NAME "librte_pmd_mlx5" @@ -84,4 +86,13 @@ /* Alarm timeout. */ #define MLX5_ALARM_TIMEOUT_US 10 +/* + * Extended flow priorities necessary to support flow director are available + * since MLNX_OFED 3.2. Considering this version adds support for VLAN + * offloads as well, their availability means flow director can be used. + */ +#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS +#define MLX5_FDIR_S
[dpdk-dev] [PATCH 5/5] mlx5: add support for RX VLAN stripping
From: Yaacov Hazan Allows HW to strip the 802.1Q header from incoming frames and report it through the mbuf structure. This feature requires MLNX_OFED >= 3.2. Signed-off-by: Yaacov Hazan Signed-off-by: Adrien Mazarguil --- drivers/net/mlx5/mlx5.c | 16 ++- drivers/net/mlx5/mlx5.h | 3 ++ drivers/net/mlx5/mlx5_rxq.c | 15 ++- drivers/net/mlx5/mlx5_rxtx.c | 27 +++ drivers/net/mlx5/mlx5_rxtx.h | 5 +++ drivers/net/mlx5/mlx5_vlan.c | 104 +++ 6 files changed, 168 insertions(+), 2 deletions(-) diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index 2646b3f..94b4d80 100644 --- a/drivers/net/mlx5/mlx5.c +++ b/drivers/net/mlx5/mlx5.c @@ -170,6 +170,10 @@ static const struct eth_dev_ops mlx5_dev_ops = { .mac_addr_remove = mlx5_mac_addr_remove, .mac_addr_add = mlx5_mac_addr_add, .mtu_set = mlx5_dev_set_mtu, +#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS + .vlan_strip_queue_set = mlx5_vlan_strip_queue_set, + .vlan_offload_set = mlx5_vlan_offload_set, +#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */ .reta_update = mlx5_dev_rss_reta_update, .reta_query = mlx5_dev_rss_reta_query, .rss_hash_update = mlx5_rss_hash_update, @@ -324,7 +328,11 @@ mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) #ifdef HAVE_EXP_QUERY_DEVICE exp_device_attr.comp_mask = IBV_EXP_DEVICE_ATTR_EXP_CAP_FLAGS | - IBV_EXP_DEVICE_ATTR_RX_HASH; + IBV_EXP_DEVICE_ATTR_RX_HASH | +#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS + IBV_EXP_DEVICE_ATTR_VLAN_OFFLOADS | +#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */ + 0; #endif /* HAVE_EXP_QUERY_DEVICE */ DEBUG("using port %u (%08" PRIx32 ")", port, test); @@ -395,6 +403,12 @@ mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) priv->ind_table_max_size = RSS_INDIRECTION_TABLE_SIZE; DEBUG("maximum RX indirection table size is %u", priv->ind_table_max_size); +#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS + priv->hw_vlan_strip = !!(exp_device_attr.wq_vlan_offloads_cap & +IBV_EXP_RECEIVE_WQ_CVLAN_STRIP); +#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */ + DEBUG("VLAN stripping is %ssupported", + (priv->hw_vlan_strip ? "" : "not ")); #else /* HAVE_EXP_QUERY_DEVICE */ priv->ind_table_max_size = RSS_INDIRECTION_TABLE_SIZE; diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index fdd2504..585760b 100644 --- a/drivers/net/mlx5/mlx5.h +++ b/drivers/net/mlx5/mlx5.h @@ -101,6 +101,7 @@ struct priv { unsigned int allmulti_req:1; /* All multicast mode requested. */ unsigned int hw_csum:1; /* Checksum offload is supported. */ unsigned int hw_csum_l2tun:1; /* Same for L2 tunnels. */ + unsigned int hw_vlan_strip:1; /* VLAN stripping is supported. */ unsigned int vf:1; /* This is a VF device. */ unsigned int pending_alarm:1; /* An alarm is pending. */ /* RX/TX queues. */ @@ -210,6 +211,8 @@ void mlx5_stats_reset(struct rte_eth_dev *); /* mlx5_vlan.c */ int mlx5_vlan_filter_set(struct rte_eth_dev *, uint16_t, int); +void mlx5_vlan_offload_set(struct rte_eth_dev *, int); +void mlx5_vlan_strip_queue_set(struct rte_eth_dev *, uint16_t, int); /* mlx5_trigger.c */ diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c index 9a70f32..c79ce5c 100644 --- a/drivers/net/mlx5/mlx5_rxq.c +++ b/drivers/net/mlx5/mlx5_rxq.c @@ -1224,6 +1224,8 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc, priv->device_attr.max_qp_wr); DEBUG("priv->device_attr.max_sge is %d", priv->device_attr.max_sge); + /* Configure VLAN stripping. 
*/ + tmpl.vlan_strip = dev->data->dev_conf.rxmode.hw_vlan_strip; attr.wq = (struct ibv_exp_wq_init_attr){ .wq_context = NULL, /* Could be useful in the future. */ .wq_type = IBV_EXP_WQT_RQ, @@ -1238,8 +1240,18 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc, MLX5_PMD_SGE_WR_N), .pd = priv->pd, .cq = tmpl.cq, - .comp_mask = IBV_EXP_CREATE_WQ_RES_DOMAIN, + .comp_mask = + IBV_EXP_CREATE_WQ_RES_DOMAIN | +#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS + IBV_EXP_CREATE_WQ_VLAN_OFFLOADS | +#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */ + 0, .res_domain = tmpl.rd, +#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS + .vlan_offloads = (tmpl.vlan_strip ? + IBV_EXP_RECEIVE_WQ_CVLAN_STRIP : +
[dpdk-dev] [PATCH 0/6] Performance optimizations for mlx5
This patchset improves the mlx5 PMD performance by doing better prefetching, by reordering internal structure fields and by removing a few unnecessary operations. Note: should be applied after "Add flow director and RX VLAN stripping support" to avoid conflicts. Nelio Laranjeiro (6): mlx5: prefetch next TX mbuf header and data mlx5: reorder TX/RX queue structure mlx5: remove one indirection level from RX/TX functions mlx5: process offload flags only when requested mlx5: avoid lkey retrieval for inlined packets mlx5: free buffers immediately after completion drivers/net/mlx5/Makefile| 1 + drivers/net/mlx5/mlx5_rxq.c | 12 drivers/net/mlx5/mlx5_rxtx.c | 136 +++ drivers/net/mlx5/mlx5_rxtx.h | 54 ++--- drivers/net/mlx5/mlx5_txq.c | 14 + 5 files changed, 132 insertions(+), 85 deletions(-) -- 2.1.4
[dpdk-dev] [PATCH 1/6] mlx5: prefetch next TX mbuf header and data
From: Nelio Laranjeiro This change improves performance noticeably. Signed-off-by: Nelio Laranjeiro --- drivers/net/mlx5/mlx5_rxtx.c | 16 +++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c index 7585570..bee5ce2 100644 --- a/drivers/net/mlx5/mlx5_rxtx.c +++ b/drivers/net/mlx5/mlx5_rxtx.c @@ -443,8 +443,11 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) unsigned int i; unsigned int max; int err; + struct rte_mbuf *buf = pkts[0]; assert(elts_comp_cd != 0); + /* Prefetch first packet cacheline. */ + rte_prefetch0(buf); txq_complete(txq); max = (elts_n - (elts_head - txq->elts_tail)); if (max > elts_n) @@ -458,7 +461,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) if (max > pkts_n) max = pkts_n; for (i = 0; (i != max); ++i) { - struct rte_mbuf *buf = pkts[i]; + struct rte_mbuf *buf_next = pkts[i + 1]; unsigned int elts_head_next = (((elts_head + 1) == elts_n) ? 0 : elts_head + 1); struct txq_elt *elt_next = &(*txq->elts)[elts_head_next]; @@ -481,6 +484,8 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) tmp = next; } while (tmp != NULL); } + if (i + 1 < max) + rte_prefetch0(buf_next); /* Request TX completion. */ if (unlikely(--elts_comp_cd == 0)) { elts_comp_cd = txq->elts_comp_cd_init; @@ -502,6 +507,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) uintptr_t addr; uint32_t length; uint32_t lkey; + uintptr_t buf_next_addr; /* Retrieve buffer information. */ addr = rte_pktmbuf_mtod(buf, uintptr_t); @@ -522,6 +528,13 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) rte_prefetch0((volatile void *) (uintptr_t)addr); RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf); + /* Prefetch next buffer data. */ + if (i + 1 < max) { + buf_next_addr = + rte_pktmbuf_mtod(buf_next, uintptr_t); + rte_prefetch0((volatile void *) + (uintptr_t)buf_next_addr); + } /* Put packet into send queue. */ #if MLX5_PMD_MAX_INLINE > 0 if (length <= txq->max_inline) @@ -571,6 +584,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) #endif /* MLX5_PMD_SGE_WR_N > 1 */ } elts_head = elts_head_next; + buf = buf_next; #ifdef MLX5_PMD_SOFT_COUNTERS /* Increment sent bytes counter. */ txq->stats.obytes += sent_size; -- 2.1.4
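A minimal sketch of the pattern applied above inside mlx5_tx_burst(), shown outside the diff context (the function name and loop body are illustrative, not the PMD code):

#include <rte_mbuf.h>
#include <rte_prefetch.h>

/* Prefetch the header and data of pkts[i + 1] while pkts[i] is being
 * posted, so the next iteration finds both cache lines warm. */
static inline void
tx_burst_prefetch_sketch(struct rte_mbuf **pkts, uint16_t n)
{
	uint16_t i;

	if (n == 0)
		return;
	rte_prefetch0(pkts[0]); /* first mbuf header */
	for (i = 0; i != n; ++i) {
		if (i + 1 < n) {
			rte_prefetch0(pkts[i + 1]); /* next mbuf header */
			rte_prefetch0(rte_pktmbuf_mtod(pkts[i + 1], void *));
		}
		/* ... build and post the send descriptor for pkts[i] ... */
	}
}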
[dpdk-dev] [PATCH 2/6] mlx5: reorder TX/RX queue structure
From: Nelio Laranjeiro Remove padding and move important fields to the beginning for better performance. Signed-off-by: Nelio Laranjeiro --- drivers/net/mlx5/mlx5_rxtx.h | 31 --- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h index fde0ca2..4a857d8 100644 --- a/drivers/net/mlx5/mlx5_rxtx.h +++ b/drivers/net/mlx5/mlx5_rxtx.h @@ -105,7 +105,6 @@ struct priv; struct rxq { struct priv *priv; /* Back pointer to private data. */ struct rte_mempool *mp; /* Memory Pool for allocations. */ - struct ibv_mr *mr; /* Memory Region (for mp). */ struct ibv_cq *cq; /* Completion Queue. */ struct ibv_exp_wq *wq; /* Work Queue. */ struct ibv_exp_wq_family *if_wq; /* WQ burst interface. */ @@ -117,19 +116,20 @@ struct rxq { unsigned int port_id; /* Port ID for incoming packets. */ unsigned int elts_n; /* (*elts)[] length. */ unsigned int elts_head; /* Current index in (*elts)[]. */ - union { - struct rxq_elt_sp (*sp)[]; /* Scattered RX elements. */ - struct rxq_elt (*no_sp)[]; /* RX elements. */ - } elts; unsigned int sp:1; /* Use scattered RX elements. */ unsigned int csum:1; /* Enable checksum offloading. */ unsigned int csum_l2tun:1; /* Same for L2 tunnels. */ unsigned int vlan_strip:1; /* Enable VLAN stripping. */ + union { + struct rxq_elt_sp (*sp)[]; /* Scattered RX elements. */ + struct rxq_elt (*no_sp)[]; /* RX elements. */ + } elts; uint32_t mb_len; /* Length of a mp-issued mbuf. */ - struct mlx5_rxq_stats stats; /* RX queue counters. */ unsigned int socket; /* CPU socket ID for allocations. */ + struct mlx5_rxq_stats stats; /* RX queue counters. */ struct ibv_exp_res_domain *rd; /* Resource Domain. */ struct fdir_queue fdir_queue; /* Flow director queue. */ + struct ibv_mr *mr; /* Memory Region (for mp). */ }; /* Hash RX queue types. */ @@ -248,30 +248,31 @@ typedef uint8_t linear_t[16384]; /* TX queue descriptor. */ struct txq { struct priv *priv; /* Back pointer to private data. */ - struct { - const struct rte_mempool *mp; /* Cached Memory Pool. */ - struct ibv_mr *mr; /* Memory Region (for mp). */ - uint32_t lkey; /* mr->lkey */ - } mp2mr[MLX5_PMD_TX_MP_CACHE]; /* MP to MR translation table. */ struct ibv_cq *cq; /* Completion Queue. */ struct ibv_qp *qp; /* Queue Pair. */ - struct ibv_exp_qp_burst_family *if_qp; /* QP burst interface. */ - struct ibv_exp_cq_family *if_cq; /* CQ interface. */ + struct txq_elt (*elts)[]; /* TX elements. */ #if MLX5_PMD_MAX_INLINE > 0 uint32_t max_inline; /* Max inline send size <= MLX5_PMD_MAX_INLINE. */ #endif unsigned int elts_n; /* (*elts)[] length. */ - struct txq_elt (*elts)[]; /* TX elements. */ unsigned int elts_head; /* Current index in (*elts)[]. */ unsigned int elts_tail; /* First element awaiting completion. */ unsigned int elts_comp; /* Number of completion requests. */ unsigned int elts_comp_cd; /* Countdown for next completion request. */ unsigned int elts_comp_cd_init; /* Initial value for countdown. */ + struct { + const struct rte_mempool *mp; /* Cached Memory Pool. */ + struct ibv_mr *mr; /* Memory Region (for mp). */ + uint32_t lkey; /* mr->lkey */ + } mp2mr[MLX5_PMD_TX_MP_CACHE]; /* MP to MR translation table. */ struct mlx5_txq_stats stats; /* TX queue counters. */ + /* Elements used only for init part are here. */ linear_t (*elts_linear)[]; /* Linearized buffers. */ struct ibv_mr *mr_linear; /* Memory Region for linearized buffers. */ - unsigned int socket; /* CPU socket ID for allocations. */ + struct ibv_exp_qp_burst_family *if_qp; /* QP burst interface. 
*/ + struct ibv_exp_cq_family *if_cq; /* CQ interface. */ struct ibv_exp_res_domain *rd; /* Resource Domain. */ + unsigned int socket; /* CPU socket ID for allocations. */ }; /* mlx5_rxq.c */ -- 2.1.4
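The principle behind the reordering, as a hedged generic sketch (struct example_rxq is hypothetical, not the mlx5 definition): fields touched for every packet sit in the leading cache lines, while setup-only fields are pushed to the end.

#include <rte_memory.h>

struct example_rxq {
	/* Hot: read or written in the RX burst function. */
	void *cq;                  /* completion queue */
	void *wq;                  /* work queue */
	unsigned int elts_n;       /* ring size */
	unsigned int elts_head;    /* current index */
	unsigned int vlan_strip:1; /* per-packet offload flag */
	/* Cold: used only at setup/teardown or in the control path. */
	unsigned int socket;       /* NUMA socket for allocations */
	void *mr;                  /* memory region, registered once */
} __rte_cache_aligned;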
[dpdk-dev] [PATCH 3/6] mlx5: remove one indirection level from RX/TX functions
From: Nelio Laranjeiro Avoid dereferencing pointers twice to get to fast Verbs functions by storing them directly in RX/TX queue structures. Signed-off-by: Nelio Laranjeiro --- drivers/net/mlx5/Makefile| 1 + drivers/net/mlx5/mlx5_rxq.c | 12 drivers/net/mlx5/mlx5_rxtx.c | 34 +- drivers/net/mlx5/mlx5_rxtx.h | 23 +-- drivers/net/mlx5/mlx5_txq.c | 14 ++ 5 files changed, 53 insertions(+), 31 deletions(-) diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile index 46a17e0..39cdf2c 100644 --- a/drivers/net/mlx5/Makefile +++ b/drivers/net/mlx5/Makefile @@ -67,6 +67,7 @@ CFLAGS += -g CFLAGS += -I. CFLAGS += -D_XOPEN_SOURCE=600 CFLAGS += $(WERROR_FLAGS) +CFLAGS += -Wno-strict-prototypes LDLIBS += -libverbs # A few warnings cannot be avoided in external headers. diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c index c79ce5c..537737f 100644 --- a/drivers/net/mlx5/mlx5_rxq.c +++ b/drivers/net/mlx5/mlx5_rxq.c @@ -901,6 +901,8 @@ rxq_cleanup(struct rxq *rxq) rxq_free_elts_sp(rxq); else rxq_free_elts(rxq); + rxq->poll = NULL; + rxq->recv = NULL; if (rxq->if_wq != NULL) { assert(rxq->priv != NULL); assert(rxq->priv->ctx != NULL); @@ -1343,6 +1345,16 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc, *rxq = tmpl; DEBUG("%p: rxq updated with %p", (void *)rxq, (void *)&tmpl); assert(ret == 0); + /* Assign function in queue. */ +#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS + rxq->poll = rxq->if_cq->poll_length_flags_cvlan; +#else /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */ + rxq->poll = rxq->if_cq->poll_length_flags; +#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */ + if (rxq->sp) + rxq->recv = rxq->if_wq->recv_sg_list; + else + rxq->recv = rxq->if_wq->recv_burst; return 0; error: rxq_cleanup(&tmpl); diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c index bee5ce2..63ddc53 100644 --- a/drivers/net/mlx5/mlx5_rxtx.c +++ b/drivers/net/mlx5/mlx5_rxtx.c @@ -93,7 +93,7 @@ txq_complete(struct txq *txq) DEBUG("%p: processing %u work requests completions", (void *)txq, elts_comp); #endif - wcs_n = txq->if_cq->poll_cnt(txq->cq, elts_comp); + wcs_n = txq->poll_cnt(txq->cq, elts_comp); if (unlikely(wcs_n == 0)) return 0; if (unlikely(wcs_n < 0)) { @@ -538,14 +538,14 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) /* Put packet into send queue. */ #if MLX5_PMD_MAX_INLINE > 0 if (length <= txq->max_inline) - err = txq->if_qp->send_pending_inline + err = txq->send_pending_inline (txq->qp, (void *)addr, length, send_flags); else #endif - err = txq->if_qp->send_pending + err = txq->send_pending (txq->qp, addr, length, @@ -567,7 +567,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) goto stop; RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf); /* Put SG list into send queue. */ - err = txq->if_qp->send_pending_sg_list + err = txq->send_pending_sg_list (txq->qp, sges, ret.num, @@ -599,7 +599,7 @@ stop: txq->stats.opackets += i; #endif /* Ring QP doorbell. */ - err = txq->if_qp->send_flush(txq->qp); + err = txq->send_flush(txq->qp); if (unlikely(err)) { /* A nonzero value is not supposed to be returned. * Nothing can be done about it. */ @@ -733,14 +733,7 @@ mlx5_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) /* Sanity checks. 
*/ assert(elts_head < rxq->elts_n); assert(rxq->elts_head < rxq->elts_n); -#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS - ret = rxq->if_cq->poll_length_flags_cvlan(rxq->cq, NULL, NULL, - &flags, &vlan_tci); -#else /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */ - ret = rxq->if_cq->poll_length_flags(rxq->cq, NULL, NULL, - &flags); - (void)vlan_tci; -#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */ +
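The gist of the change as a hedged sketch (the example_* names are made up): the burst-mode function pointer obtained from Verbs at setup time is copied into the queue structure itself, so the datapath does a single load instead of chasing txq->if_qp->send_flush on every burst.

struct example_if_qp {
	int (*send_flush)(void *qp);
};

struct example_txq {
	void *qp;
	int (*send_flush)(void *qp);   /* cached from if_qp at setup */
	struct example_if_qp *if_qp;   /* kept only for setup/teardown */
};

/* Setup path: copy the pointer once. */
static inline void
txq_cache_burst_fn_sketch(struct example_txq *txq)
{
	txq->send_flush = txq->if_qp->send_flush;
}

/* Hot path: one indirection instead of two. */
static inline int
txq_flush_sketch(struct example_txq *txq)
{
	return txq->send_flush(txq->qp);
}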
[dpdk-dev] [PATCH 4/6] mlx5: process offload flags only when requested
From: Nelio Laranjeiro Improve performance by processing offloads only when requested by the application. Signed-off-by: Nelio Laranjeiro --- drivers/net/mlx5/mlx5_rxtx.c | 29 - 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c index 63ddc53..c84ec8c 100644 --- a/drivers/net/mlx5/mlx5_rxtx.c +++ b/drivers/net/mlx5/mlx5_rxtx.c @@ -853,14 +853,16 @@ mlx5_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) NB_SEGS(pkt_buf) = j; PORT(pkt_buf) = rxq->port_id; PKT_LEN(pkt_buf) = pkt_buf_len; - pkt_buf->packet_type = rxq_cq_to_pkt_type(flags); - pkt_buf->ol_flags = rxq_cq_to_ol_flags(rxq, flags); + if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip) { + pkt_buf->packet_type = rxq_cq_to_pkt_type(flags); + pkt_buf->ol_flags = rxq_cq_to_ol_flags(rxq, flags); #ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS - if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) { - pkt_buf->ol_flags |= PKT_RX_VLAN_PKT; - pkt_buf->vlan_tci = vlan_tci; - } + if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) { + pkt_buf->ol_flags |= PKT_RX_VLAN_PKT; + pkt_buf->vlan_tci = vlan_tci; + } #endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */ + } /* Return packet. */ *(pkts++) = pkt_buf; @@ -1006,15 +1008,16 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) NEXT(seg) = NULL; PKT_LEN(seg) = len; DATA_LEN(seg) = len; - seg->packet_type = rxq_cq_to_pkt_type(flags); - seg->ol_flags = rxq_cq_to_ol_flags(rxq, flags); + if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip) { + seg->packet_type = rxq_cq_to_pkt_type(flags); + seg->ol_flags = rxq_cq_to_ol_flags(rxq, flags); #ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS - if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) { - seg->ol_flags |= PKT_RX_VLAN_PKT; - seg->vlan_tci = vlan_tci; - } + if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) { + seg->ol_flags |= PKT_RX_VLAN_PKT; + seg->vlan_tci = vlan_tci; + } #endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */ - + } /* Return packet. */ *(pkts++) = seg; ++pkts_ret; -- 2.1.4
[dpdk-dev] [PATCH 5/6] mlx5: avoid lkey retrieval for inlined packets
From: Nelio Laranjeiro Improves performance as the lkey is not needed by hardware in this case. Signed-off-by: Nelio Laranjeiro --- drivers/net/mlx5/mlx5_rxtx.c | 22 -- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c index c84ec8c..579efa0 100644 --- a/drivers/net/mlx5/mlx5_rxtx.c +++ b/drivers/net/mlx5/mlx5_rxtx.c @@ -512,16 +512,6 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) /* Retrieve buffer information. */ addr = rte_pktmbuf_mtod(buf, uintptr_t); length = DATA_LEN(buf); - /* Retrieve Memory Region key for this memory pool. */ - lkey = txq_mp2mr(txq, txq_mb2mp(buf)); - if (unlikely(lkey == (uint32_t)-1)) { - /* MR does not exist. */ - DEBUG("%p: unable to get MP <-> MR" - " association", (void *)txq); - /* Clean up TX element. */ - elt->buf = NULL; - goto stop; - } /* Update element. */ elt->buf = buf; if (txq->priv->vf) @@ -545,12 +535,24 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) send_flags); else #endif + { + /* Retrieve Memory Region key for this memory pool. */ + lkey = txq_mp2mr(txq, txq_mb2mp(buf)); + if (unlikely(lkey == (uint32_t)-1)) { + /* MR does not exist. */ + DEBUG("%p: unable to get MP <-> MR" + " association", (void *)txq); + /* Clean up TX element. */ + elt->buf = NULL; + goto stop; + } err = txq->send_pending (txq->qp, addr, length, lkey, send_flags); + } if (unlikely(err)) goto stop; #ifdef MLX5_PMD_SOFT_COUNTERS -- 2.1.4
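The resulting control flow, as a hedged sketch (the stub_* helpers merely stand in for txq_mp2mr() and the Verbs send calls): the memory-region key is looked up only on the non-inline path, because inlined data is copied into the descriptor and the hardware never dereferences the original buffer.

#include <stdint.h>
#include <rte_mbuf.h>

static inline uint32_t stub_mp2mr(struct rte_mbuf *b) { (void)b; return 0; }
static inline int stub_send_inline(void *a, uint32_t l) { (void)a; (void)l; return 0; }
static inline int stub_send(void *a, uint32_t l, uint32_t lkey) { (void)a; (void)l; (void)lkey; return 0; }

static inline int
tx_one_sketch(struct rte_mbuf *buf, uint32_t max_inline)
{
	void *addr = rte_pktmbuf_mtod(buf, void *);
	uint32_t length = rte_pktmbuf_data_len(buf);

	if (length <= max_inline)
		/* Data is copied into the WQE: no lkey needed. */
		return stub_send_inline(addr, length);
	/* Only the non-inline path pays for the MP -> MR translation. */
	return stub_send(addr, length, stub_mp2mr(buf));
}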
[dpdk-dev] [PATCH 6/6] mlx5: free buffers immediately after completion
From: Nelio Laranjeiro This lowers the amount of cache misses. Signed-off-by: Nelio Laranjeiro --- drivers/net/mlx5/mlx5_rxtx.c | 35 --- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c index 579efa0..36abeef 100644 --- a/drivers/net/mlx5/mlx5_rxtx.c +++ b/drivers/net/mlx5/mlx5_rxtx.c @@ -84,6 +84,7 @@ txq_complete(struct txq *txq) { unsigned int elts_comp = txq->elts_comp; unsigned int elts_tail = txq->elts_tail; + unsigned int elts_free = txq->elts_tail; const unsigned int elts_n = txq->elts_n; int wcs_n; @@ -110,6 +111,25 @@ txq_complete(struct txq *txq) elts_tail += wcs_n * txq->elts_comp_cd_init; if (elts_tail >= elts_n) elts_tail -= elts_n; + + while (elts_free != elts_tail) { + struct txq_elt *elt = &(*txq->elts)[elts_free]; + unsigned int elts_free_next = + (((elts_free + 1) == elts_n) ? 0 : elts_free + 1); + struct rte_mbuf *tmp = elt->buf; + struct txq_elt *elt_next = &(*txq->elts)[elts_free_next]; + + RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf); + /* Faster than rte_pktmbuf_free(). */ + do { + struct rte_mbuf *next = NEXT(tmp); + + rte_pktmbuf_free_seg(tmp); + tmp = next; + } while (tmp != NULL); + elts_free = elts_free_next; + } + txq->elts_tail = elts_tail; txq->elts_comp = elts_comp; return 0; @@ -464,7 +484,6 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) struct rte_mbuf *buf_next = pkts[i + 1]; unsigned int elts_head_next = (((elts_head + 1) == elts_n) ? 0 : elts_head + 1); - struct txq_elt *elt_next = &(*txq->elts)[elts_head_next]; struct txq_elt *elt = &(*txq->elts)[elts_head]; unsigned int segs = NB_SEGS(buf); #ifdef MLX5_PMD_SOFT_COUNTERS @@ -472,18 +491,6 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) #endif uint32_t send_flags = 0; - /* Clean up old buffer. */ - if (likely(elt->buf != NULL)) { - struct rte_mbuf *tmp = elt->buf; - - /* Faster than rte_pktmbuf_free(). */ - do { - struct rte_mbuf *next = NEXT(tmp); - - rte_pktmbuf_free_seg(tmp); - tmp = next; - } while (tmp != NULL); - } if (i + 1 < max) rte_prefetch0(buf_next); /* Request TX completion. */ @@ -517,7 +524,6 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) if (txq->priv->vf) rte_prefetch0((volatile void *) (uintptr_t)addr); - RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf); /* Prefetch next buffer data. */ if (i + 1 < max) { buf_next_addr = @@ -567,7 +573,6 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) &sges); if (ret.length == (unsigned int)-1) goto stop; - RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf); /* Put SG list into send queue. */ err = txq->send_pending_sg_list (txq->qp, -- 2.1.4
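A hedged sketch of the new txq_complete() behaviour (example_elt and the function name are illustrative): mbuf chains are returned to the pool as soon as their completions are processed, while their metadata is still likely to be cached, instead of being freed lazily when the TX slot is reused.

#include <rte_mbuf.h>

struct example_elt {
	struct rte_mbuf *buf;
};

static inline void
txq_free_completed_sketch(struct example_elt *elts, unsigned int elts_n,
			  unsigned int tail, unsigned int new_tail)
{
	while (tail != new_tail) {
		struct rte_mbuf *m = elts[tail].buf;

		/* Free the whole segment chain of this element. */
		while (m != NULL) {
			struct rte_mbuf *next = m->next;

			rte_pktmbuf_free_seg(m);
			m = next;
		}
		elts[tail].buf = NULL;
		tail = (tail + 1 == elts_n) ? 0 : tail + 1;
	}
}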
[dpdk-dev] [PATCH v2 0/2] Increase number of next hops for LPM IPv4
This patchset extends the next_hop field from 8 bits to 24 bits in the LPM library for IPv4. As the next_hop field is larger, the maximum number of tbl8s is now 2^24. A new rte_lpm_config structure is used so the LPM library allocates exactly the amount of memory necessary to hold the application's rules. The rte_lpm_tbl24_entry and rte_lpm_tbl8_entry structures in the LPM library were merged into a single rte_lpm_tbl_entry structure. Versioning symbols were added to the functions, and the library and the applications that depend on the LPM library were updated. Michal Kobylinski (2): lpm: extend ip4 next_hop and add config structure examples: update to use new lpm lib for ip4 app/test/test_func_reentrancy.c |9 +- app/test/test_lpm.c | 274 -- app/test/test_mp_secondary.c|7 +- app/test/test_table_combined.c |1 + app/test/test_table_tables.c|2 + doc/guides/rel_notes/release_2_3.rst|4 + examples/ip_fragmentation/main.c| 23 +- examples/ip_reassembly/main.c | 22 +- examples/l3fwd-power/main.c | 12 +- examples/l3fwd-vf/main.c| 12 +- examples/l3fwd/main.c | 51 +- examples/load_balancer/init.c |8 +- examples/load_balancer/runtime.c|2 +- examples/performance-thread/l3fwd-thread/main.c | 10 +- lib/librte_lpm/Makefile |2 +- lib/librte_lpm/rte_lpm.c| 1157 --- lib/librte_lpm/rte_lpm.h| 232 +++-- lib/librte_lpm/rte_lpm_version.map | 49 +- lib/librte_table/rte_table.h|1 + lib/librte_table/rte_table_lpm.c| 30 +- lib/librte_table/rte_table_lpm.h|4 + 21 files changed, 1546 insertions(+), 366 deletions(-) -- 1.9.1
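A hedged usage sketch of the reworked creation and lookup path described above (the table sizes, route and next-hop value are illustrative; the point is that sizing comes from rte_lpm_config and next_hop is carried in a uint32_t):

#include <stdint.h>
#include <rte_ip.h>
#include <rte_lpm.h>
#include <rte_memory.h>

static int
lpm_config_sketch(uint32_t ip, uint32_t *next_hop)
{
	struct rte_lpm_config config = {
		.max_rules = 1024,      /* illustrative sizes */
		.number_tbl8s = 256,    /* may now grow up to 1 << 24 */
		.flags = 0,
	};
	struct rte_lpm *lpm;
	int ret;

	/* Memory is allocated according to the config, not fixed limits. */
	lpm = rte_lpm_create("example_lpm", SOCKET_ID_ANY, &config);
	if (lpm == NULL)
		return -1;
	/* next_hop values wider than 8 bits are now accepted. */
	ret = rte_lpm_add(lpm, IPv4(10, 0, 0, 0), 24, 100000);
	if (ret == 0)
		ret = rte_lpm_lookup(lpm, ip, next_hop);
	rte_lpm_free(lpm);
	return ret;
}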
[dpdk-dev] [PATCH v2 1/2] lpm: extend ip4 next_hop and add config structure
As next_hop field for IPv4 is increased now the maximum number of tbl8s is 2^24. A new rte_lpm_config structure is used so LPM library will allocate exactly the amount of memory which is necessary to hold application?s rules. Changed structures in LPM library: rte_lpm_tbl24_entry and rte_lpm_tbl8_entry to one structure rte_lpm_tbl_entry. Signed-off-by: Michal Kobylinski --- app/test/test_func_reentrancy.c |9 +- app/test/test_lpm.c | 274 +--- app/test/test_mp_secondary.c |7 +- app/test/test_table_combined.c |1 + app/test/test_table_tables.c |2 + doc/guides/rel_notes/release_2_3.rst |4 + lib/librte_lpm/Makefile |2 +- lib/librte_lpm/rte_lpm.c | 1157 ++ lib/librte_lpm/rte_lpm.h | 232 +-- lib/librte_lpm/rte_lpm_version.map | 49 +- lib/librte_table/rte_table.h |1 + lib/librte_table/rte_table_lpm.c | 30 +- lib/librte_table/rte_table_lpm.h |4 + 13 files changed, 1454 insertions(+), 318 deletions(-) diff --git a/app/test/test_func_reentrancy.c b/app/test/test_func_reentrancy.c index dbecc52..592e5d0 100644 --- a/app/test/test_func_reentrancy.c +++ b/app/test/test_func_reentrancy.c @@ -359,6 +359,11 @@ lpm_create_free(__attribute__((unused)) void *arg) { unsigned lcore_self = rte_lcore_id(); struct rte_lpm *lpm; + struct rte_lpm_config config; + + config.max_rules = 4; + config.number_tbl8s = 256; + config.flags = 0; char lpm_name[MAX_STRING_SIZE]; int i; @@ -366,7 +371,7 @@ lpm_create_free(__attribute__((unused)) void *arg) /* create the same lpm simultaneously on all threads */ for (i = 0; i < MAX_ITER_TIMES; i++) { - lpm = rte_lpm_create("fr_test_once", SOCKET_ID_ANY, 4, 0); + lpm = rte_lpm_create("fr_test_once", 0, &config); if ((NULL == lpm) && (rte_lpm_find_existing("fr_test_once") == NULL)) return -1; } @@ -374,7 +379,7 @@ lpm_create_free(__attribute__((unused)) void *arg) /* create mutiple fbk tables simultaneously */ for (i = 0; i < MAX_LPM_ITER_TIMES; i++) { snprintf(lpm_name, sizeof(lpm_name), "fr_test_%d_%d", lcore_self, i); - lpm = rte_lpm_create(lpm_name, SOCKET_ID_ANY, 4, 0); + lpm = rte_lpm_create(lpm_name, SOCKET_ID_ANY, &config); if (NULL == lpm) return -1; diff --git a/app/test/test_lpm.c b/app/test/test_lpm.c index 8b4ded9..69026a4 100644 --- a/app/test/test_lpm.c +++ b/app/test/test_lpm.c @@ -57,7 +57,7 @@ } \ } while(0) -typedef int32_t (* rte_lpm_test)(void); +typedef int32_t (*rte_lpm_test)(void); static int32_t test0(void); static int32_t test1(void); @@ -105,6 +105,7 @@ rte_lpm_test tests[] = { #define NUM_LPM_TESTS (sizeof(tests)/sizeof(tests[0])) #define MAX_DEPTH 32 #define MAX_RULES 256 +#define NUMBER_TBL8S 256 #define PASS 0 /* @@ -115,18 +116,25 @@ int32_t test0(void) { struct rte_lpm *lpm = NULL; + struct rte_lpm_config config; + + config.max_rules = MAX_RULES; + config.number_tbl8s = NUMBER_TBL8S; + config.flags = 0; /* rte_lpm_create: lpm name == NULL */ - lpm = rte_lpm_create(NULL, SOCKET_ID_ANY, MAX_RULES, 0); + lpm = rte_lpm_create(NULL, SOCKET_ID_ANY, &config); TEST_LPM_ASSERT(lpm == NULL); /* rte_lpm_create: max_rules = 0 */ /* Note: __func__ inserts the function name, in this case "test0". 
*/ - lpm = rte_lpm_create(__func__, SOCKET_ID_ANY, 0, 0); + config.max_rules = 0; + lpm = rte_lpm_create(__func__, SOCKET_ID_ANY, &config); TEST_LPM_ASSERT(lpm == NULL); /* socket_id < -1 is invalid */ - lpm = rte_lpm_create(__func__, -2, MAX_RULES, 0); + config.max_rules = MAX_RULES; + lpm = rte_lpm_create(__func__, -2, &config); TEST_LPM_ASSERT(lpm == NULL); return PASS; @@ -140,11 +148,17 @@ int32_t test1(void) { struct rte_lpm *lpm = NULL; + struct rte_lpm_config config; + + config.number_tbl8s = NUMBER_TBL8S; + config.flags = 0; + int32_t i; /* rte_lpm_free: Free NULL */ for (i = 0; i < 100; i++) { - lpm = rte_lpm_create(__func__, SOCKET_ID_ANY, MAX_RULES - i, 0); + config.max_rules = MAX_RULES - i; + lpm = rte_lpm_create(__func__, SOCKET_ID_ANY, &config); TEST_LPM_ASSERT(lpm != NULL); rte_lpm_free(lpm); @@ -164,8 +178,13 @@ int32_t test2(void) { struct rte_lpm *lpm = NULL; + struct rte_lpm_config config; - lpm = rte_lpm_create(__func__, SOCKET_ID_ANY, MAX_RULES, 0); + config.max_rules = MAX_RULES; + config.number_tbl8s = NUMBER_TBL8S; + config.
[dpdk-dev] [PATCH 2/2] examples: update to use new lpm lib for ip4
Update other applications to use new structures from LPM library for IPv4. Signed-off-by: Michal Kobylinski --- examples/ip_fragmentation/main.c| 23 ++- examples/ip_reassembly/main.c | 22 +++ examples/l3fwd-power/main.c | 12 -- examples/l3fwd-vf/main.c| 12 -- examples/l3fwd/main.c | 51 - examples/load_balancer/init.c | 8 +++- examples/load_balancer/runtime.c| 2 +- examples/performance-thread/l3fwd-thread/main.c | 10 +++-- 8 files changed, 92 insertions(+), 48 deletions(-) diff --git a/examples/ip_fragmentation/main.c b/examples/ip_fragmentation/main.c index fbc0b8d..367cab2 100644 --- a/examples/ip_fragmentation/main.c +++ b/examples/ip_fragmentation/main.c @@ -266,8 +266,8 @@ l3fwd_simple_forward(struct rte_mbuf *m, struct lcore_queue_conf *qconf, uint8_t queueid, uint8_t port_in) { struct rx_queue *rxq; - uint32_t i, len; - uint8_t next_hop, port_out, ipv6; + uint32_t i, len, next_hop_ip4; + uint8_t next_hop_ip6, port_out, ipv6; int32_t len2; ipv6 = 0; @@ -291,9 +291,9 @@ l3fwd_simple_forward(struct rte_mbuf *m, struct lcore_queue_conf *qconf, ip_dst = rte_be_to_cpu_32(ip_hdr->dst_addr); /* Find destination port */ - if (rte_lpm_lookup(rxq->lpm, ip_dst, &next_hop) == 0 && - (enabled_port_mask & 1 << next_hop) != 0) { - port_out = next_hop; + if (rte_lpm_lookup(rxq->lpm, ip_dst, &next_hop_ip4) == 0 && + (enabled_port_mask & 1 << next_hop_ip4) != 0) { + port_out = next_hop_ip4; /* Build transmission burst for new port */ len = qconf->tx_mbufs[port_out].len; @@ -327,9 +327,9 @@ l3fwd_simple_forward(struct rte_mbuf *m, struct lcore_queue_conf *qconf, ip_hdr = rte_pktmbuf_mtod(m, struct ipv6_hdr *); /* Find destination port */ - if (rte_lpm6_lookup(rxq->lpm6, ip_hdr->dst_addr, &next_hop) == 0 && - (enabled_port_mask & 1 << next_hop) != 0) { - port_out = next_hop; + if (rte_lpm6_lookup(rxq->lpm6, ip_hdr->dst_addr, &next_hop_ip6) == 0 && + (enabled_port_mask & 1 << next_hop_ip6) != 0) { + port_out = next_hop_ip6; /* Build transmission burst for new port */ len = qconf->tx_mbufs[port_out].len; @@ -721,6 +721,7 @@ init_mem(void) struct rte_mempool *mp; struct rte_lpm *lpm; struct rte_lpm6 *lpm6; + struct rte_lpm_config lpm_config; int socket; unsigned lcore_id; @@ -768,7 +769,11 @@ init_mem(void) RTE_LOG(INFO, IP_FRAG, "Creating LPM table on socket %i\n", socket); snprintf(buf, sizeof(buf), "IP_FRAG_LPM_%i", socket); - lpm = rte_lpm_create(buf, socket, LPM_MAX_RULES, 0); + lpm_config.max_rules = LPM6_MAX_RULES; + lpm_config.number_tbl8s = 256; + lpm_config.flags = 0; + + lpm = rte_lpm_create(buf, socket, &lpm_config); if (lpm == NULL) { RTE_LOG(ERR, IP_FRAG, "Cannot create LPM table\n"); return -1; diff --git a/examples/ip_reassembly/main.c b/examples/ip_reassembly/main.c index 741c398..481f757 100644 --- a/examples/ip_reassembly/main.c +++ b/examples/ip_reassembly/main.c @@ -347,7 +347,8 @@ reassemble(struct rte_mbuf *m, uint8_t portid, uint32_t queue, struct rte_ip_frag_death_row *dr; struct rx_queue *rxq; void *d_addr_bytes; - uint8_t next_hop, dst_port; + uint32_t next_hop_ip4; + uint8_t next_hop_ip6, dst_port; rxq = &qconf->rx_queue_list[queue]; @@ -390,9 +391,9 @@ reassemble(struct rte_mbuf *m, uint8_t portid, uint32_t queue, ip_dst = rte_be_to_cpu_32(ip_hdr->dst_addr); /* Find destination port */ - if (rte_lpm_lookup(rxq->lpm, ip_dst, &next_hop) == 0 && - (enabled_port_mask & 1 << next_hop) != 0) { - dst_port = next_hop; + if (rte_lpm_lookup(rxq->lpm, ip_dst, &next_hop_ip4) == 0 && + (enabled_port_mask & 1 << next_hop_ip4) != 0) { + dst_port = next_hop_ip4; } eth_hdr->ether_type = 
rte_be_to_cpu_16(ETHER_TYPE_IPv4); @@ -427,9 +428,9 @@ reassemble(struct rte_mbuf *m, uint8_t portid, uint32_t queue, } /* Find destination port */ - if (rte_
[dpdk-dev] [PATCH v7 3/5] ethdev: redesign link speed config API
On 29 January 2016 at 11:17, Ananyev, Konstantin < konstantin.ananyev at intel.com> wrote: > > > > -Original Message- > > From: Thomas Monjalon [mailto:thomas.monjalon at 6wind.com] > > Sent: Friday, January 29, 2016 9:54 AM > > To: Ananyev, Konstantin > > Cc: dev at dpdk.org; Marc Sune; Lu, Wenzhuo; Zhang, Helin; Harish Patil; > Chen, Jing D; Mcnamara, John > > Subject: Re: [dpdk-dev] [PATCH v7 3/5] ethdev: redesign link speed > config API > > > > 2016-01-29 09:47, Ananyev, Konstantin: > > > From: Thomas Monjalon [mailto:thomas.monjalon at 6wind.com] > > > > 2016-01-29 09:24, Ananyev, Konstantin: > > > > > Can you avoid modifications in the e1000/base code? > Yes I can. It was just to save intermediate variables and conversions. Previously u16 was used to store numeric throughput in mbps in ethdev, and passed to the inner functions in e1000/base, but it cannot accommodate 100G values, so I moved it to uint32_t. But I can do the conversion outside of e1000/base. > > > > We do not modify (and maintain) that part on our own. > > > > > Instead we take it straight from Intel ND. > > > > > So if you feel like these changes are really necessary - please > submit a patch > > > > > to ND first, and if your changes will be applied, will pick it up > from them. > > > > > > > > I was not aware we can submit a change to ND for Intel base drivers. > > > > What is the procedure please? > > > > > > I meant not to the ND directly, but probably to the freebsd e1000 > kernel driver. > > > As I remember, that is the closest one to what we have. > > > From my understanding (I might be wrong here): > > > If they will be accepted, we should see these changes In next code > drops from ND. > > > > These base drivers are used in several places. > > We are allowed to submit a patch in Linux or FreeBSD but not in DPDK > > where the base driver is verbatim? > > Yes, that's my understanding. > > > We have an agreement to not touch them in DPDK > > Yes. > > > but I still think the > > ND team could consider some patches from dpdk.org. > > I personally think that would be a good thing, > but it is up to ND guys to make such decision. Agree, but: Besides documenting (which is necessary), why not importing the sources via a git submodule pointing to the base project where these files are obtained (e.g. in drivers/net/e1000/ext/ or even in the root folder under ext/ and making symlinks to that in e1000 driver), and just use the raw sources and headers from there? That would make it more explicit that these files should not be modified by DPDK by strictly forbidding so, and that changes should be done via the repository pointed in drivers/net/e1000/ext/, hence following that repository's workflow. marc > > Konstantin > >
[dpdk-dev] [PATCH] pci: Add the class_id support in pci probe
On 01/29/2016 12:10 PM, Panu Matilainen wrote: > On 01/29/2016 11:34 AM, Thomas Monjalon wrote: >> 2016-01-29 11:21, Panu Matilainen: >>> On 01/28/2016 11:38 PM, Thomas Monjalon wrote: 2016-01-13 14:22, Panu Matilainen: > On 01/13/2016 01:55 PM, Bruce Richardson wrote: >> On Thu, Dec 31, 2015 at 09:12:14AM -0800, Stephen Hemminger wrote: >>> On Tue, 29 Dec 2015 10:53:26 +0800 >>> Ziye Yang wrote: >>> This patch is used to add the class_id support for pci_probe since some devices need the class_info (class_code, subclass_code, programming_interface) Signed-off-by: Ziye Yang >>> >>> Since rte_pci is exposed to application this breaks the ABI. >> >> But applications are not going to be defining rte_pci_ids values >> internally, are >> they? That is for drivers to use. Is this really an ABI breakage >> for applications that we >> need to be concerned about? > > There might not be applications using it but drivers are ABI consumers > too - think of 3rd party drivers and such. Drivers are not ABI consumers in the sense that ABI means Application Binary Interface. We are talking about drivers interface here. When establishing the ABI policy we were discussing about applications only. >>> >>> Generally speaking an ABI is an interface between two program (or >>> software if you like) modules, its not specific to "applications". >>> Looking at http://dpdk.org/doc/guides/contributing/versioning.html I see >>> it does only talk about applications, but an ABI consumer can also be >>> another library. A driver calling rte_malloc() is just as much >>> librte_eal ABI consumer as anything else. >>> >>> Now, I understand that drivers use and need interface(s) that >>> applications have no use for or simply cannot use, and those interfaces >>> could be subject to different policies. As an extreme example, the Linux >>> kernel has two isolated ABIs, one is the userland system call interface >>> which is guaranteed to stay forever and the other is kernel module >>> interface, guarantees nothing at all. >>> >>> In DPDK the difference is far muddier than that since all the interfaces >>> live in common, versioned userland DSOs. So if there are two different >>> interfaces following two different policies, it's all the more important >>> to clearly document them. One simple way could be using a different >>> prefix than rte_. >> >> Good suggestion. Or we can simply have different files with a clear >> notice >> in their headers and in the versioning doc. >> It was well split in rte_cryptodev_pmd.h > > Using separate headers is also good. Optimally both? :) > I agree we must allow 3rd party drivers but there is no good reason to try to upgrade DPDK without upgrading/porting the external drivers. If someone does not want to release its driver and keep upgrading DPDK, it is acceptable IMHO to force an upgrade of its driver. >>> >>> Note that I've no particular sympathy for 3rd party drivers as such. >>> What I *do* care about is that breakage is made explicit, as in drivers >>> built for an incompatible version refuse to load at all, instead of >>> silently corrupting memory etc. >> >> OK I agree. > > Cool, the rest is just details then. > >> Anyway the ABI versionning does not cover the structure changes. >> What about making a DPDK version check when registering a driver? >> So a binary driver would be clearly bound to a DPDK version. > > That's one possibility. 
Another way to achieve essentially the same is > to make rte_eal_driver_register() symbol version follow the DPDK > version, in which case a driver built for another version will fail at > dlopen() already. Thinking about this a bit more, symbol versioning doesn't cut it because it's not always used (static linkage) and I guess we should cover that too. Another similar possibility that blocks it at dlopen() level is to munge the actual function name to carry a version, so it becomes something like rte_eal_driver_register_v230() and later _v240() etc. AFAICS it's only ever invoked via PMD_REGISTER_DRIVER() so the calling details can conveniently be hidden there. - Panu -
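A hedged sketch of what the suggested name munging could look like (this is not an actual EAL patch; the _v230 suffix and the macro name are illustrative, and the real PMD_REGISTER_DRIVER macro may differ):

#include <rte_dev.h>

/* Hypothetical versioned registration entry point exported by EAL. */
void rte_eal_driver_register_v230(struct rte_driver *driver);

/* Hypothetical registration macro: drivers keep one macro invocation,
 * but the constructor now references the versioned symbol, so a PMD
 * built against another DPDK release fails to resolve at dlopen()
 * (or at static link time) instead of loading silently. */
#define PMD_REGISTER_DRIVER_VERSIONED(d) \
void devinitfn_##d(void); \
void __attribute__((constructor, used)) devinitfn_##d(void) \
{ \
	rte_eal_driver_register_v230(&(d)); \
}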
[dpdk-dev] [PATCH v7 5/5] ethdev: add rte_eth_speed_to_bm_flag() to ver. map
On 01/29/2016 02:42 AM, Marc Sune wrote: > Added rte_eth_speed_to_bm_flag() to DPDK2.2 version map. > > Signed-off-by: Marc Sune > --- > lib/librte_ether/rte_ether_version.map | 6 ++ > 1 file changed, 6 insertions(+) > > diff --git a/lib/librte_ether/rte_ether_version.map > b/lib/librte_ether/rte_ether_version.map > index d8db24d..2c14ad7 100644 > --- a/lib/librte_ether/rte_ether_version.map > +++ b/lib/librte_ether/rte_ether_version.map > @@ -117,3 +117,9 @@ DPDK_2.2 { > > local: *; > }; > + > +DPDK_2.3 { > + global: > + > + rte_eth_speed_to_bm_flag; > +}DPDK_2.2; > The version map must be updated in the patch that adds the symbol(s) in question to the code, not separately. - Panu -
[dpdk-dev] [PATCH 0/5] add dpdk packet capture support for tcpdump
This patch set includes the design to capture DPDK port packets for tcpdump. It also includes test-pmd changes to verify the patch set given in Dependencies 1.

Dependencies 1: the current patch set depends on the patch set below, which must be applied first.
http://dpdk.org/dev/patchwork/patch/9750/
http://dpdk.org/dev/patchwork/patch/9751/
http://dpdk.org/dev/patchwork/patch/9752/

Dependencies 2: patches 2/5 and 3/5 of the current patch set contain the pcap based design. To compile and run these patches, libpcap must be installed and the pcap config option must be set to yes, i.e. CONFIG_RTE_LIBRTE_PMD_PCAP=y.

Packet capture flow for tcpdump:
Part of the design is implemented in the secondary process (proc_info.c) and part in the primary process (eal_interrupt.c). Communication between the two processes uses a socket and rte_rings.

[Secondary process:]
*Changes are included in patch 3/5.
*The user requests packet capture via the proc_info command line with port, queue and src ip filter information. Note: as initial development, only a basic src ip filter option is provided.
*proc_info sends the port, queue and src ip filter information to the primary process along with the register rx tx cbs message.
*proc_info creates two pcap devices for writing the ingress and egress packets of the port and queue.
*It runs in a while loop, dequeues the packets sent by the primary process over the shared rte_ring and writes them to the respective pcap files.

[Primary process:]
*Changes are included in patch 4/5.
*Creates the rte_rings and the mempool used for communicating packets with the secondary process.
*Creates a socket and waits on it for messages from the secondary process.
*Upon receiving the register rx tx cbs message, registers rte_eth_rxtx_callbacks for the ingress and egress packets of the given port and queue.
*RX callback: gets the packets and applies the src ip filter; matched packets are duplicated from the mempool and the duplicates are enqueued to an rte_ring for the secondary process to dequeue and write to the pcap file. (A hedged sketch of this callback follows the patch list below.)
*TX callback: gets the packets and applies the src ip filter; for matched packets the reference counter of the packet is incremented and the packet is enqueued to the other rte_ring for the secondary process to dequeue and write to the pcap file.

[Secondary process:]
*When the secondary process is terminated with ctrl+c, it sends the remove rx tx cbs message to the primary process.

[Primary process:]
*When the primary process receives the remove rx tx cbs message, it removes the registered rx/tx callbacks.

Users who wish to view the captured packets can run "tcpdump -r RX_pcap.pcap/TX_pcap.pcap" to view the packets of interest.

Running the changes:
===
1) Start any primary sample application, e.g.:
   sudo ./examples/rxtx_callbacks/build/rxtx_callbacks -c 0x2 -n 2
2) Start the proc_info application (which runs as a secondary process by default) with the new tcpdump parameters, e.g.:
   sudo ./build/app/proc_info/dpdk_proc_info -c 0x4 -n 2 -- -p 0x3 --tcpdump '(0,0)(1,0)' --src-ip-filter="2.2.2.2"
3) Start traffic from a traffic generator.
4) The ingress and egress packets of the DPDK ports matching the src-ip-filter are written to /tmp/RX_pcap.pcap and /tmp/TX_pcap.pcap respectively.
5) Stop the secondary process using ctrl+c and rerun it; packet capturing resumes.

Note: writing to the pcap files stops once the folder holding the pcap files reaches its maximum size.
Reshma Pattan (5): app/test-pmd: fix nb_rxq and np_txq checks drivers/net/pcap: add public api to create pcap device app/proc_info: add tcpdump support in secondary process lib/librte_eal: add tcpdump support in primary process doc: enhanced doc for tcpdump feature app/proc_info/main.c| 454 ++- app/test-pmd/cmdline.c | 11 +- app/test-pmd/parameters.c | 14 +- app/test-pmd/testpmd.c | 19 +- doc/guides/prog_guide/env_abstraction_layer.rst | 29 ++- doc/guides/sample_app_ug/proc_info.rst | 34 ++- drivers/net/pcap/Makefile |4 +- drivers/net/pcap/rte_eth_pcap.c | 156 +++- drivers/net/pcap/rte_eth_pcap.h | 87 + drivers/net/pcap/rte_pmd_pcap_version.map |8 + lib/librte_eal/linuxapp/eal/Makefile|5 +- lib/librte_eal/linuxapp/eal/eal_interrupts.c| 376 +++- 12 files changed, 1155 insertions(+), 42 deletions(-) create mode 100644 drivers/net/pcap/rte_eth_pcap.h -- 1.7.4.1
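Referring back to the design above, here is a hedged sketch of the RX-side callback the primary process would register on the "register rx tx cbs" message. All names (capture_ctx, capture_rx_cb) are hypothetical, the snippet assumes untagged IPv4 over Ethernet, and it only shows the clone-and-enqueue shape, not the actual patch 4/5 code:

#include <errno.h>
#include <rte_ethdev.h>
#include <rte_ether.h>
#include <rte_ip.h>
#include <rte_mbuf.h>
#include <rte_ring.h>

struct capture_ctx {
	struct rte_ring *ring;     /* shared with the secondary process */
	struct rte_mempool *pool;  /* pool used for duplicated packets */
	uint32_t src_ip;           /* src ip filter, network byte order */
};

/* Clone packets whose IPv4 source matches the filter and hand the
 * clones to the secondary process; the application still receives
 * every original packet untouched. */
static uint16_t
capture_rx_cb(uint8_t port, uint16_t queue, struct rte_mbuf *pkts[],
	      uint16_t nb_pkts, uint16_t max_pkts, void *user)
{
	struct capture_ctx *ctx = user;
	uint16_t i;

	(void)port;
	(void)queue;
	(void)max_pkts;
	for (i = 0; i < nb_pkts; i++) {
		struct ipv4_hdr *ip = rte_pktmbuf_mtod_offset(pkts[i],
			struct ipv4_hdr *, sizeof(struct ether_hdr));
		struct rte_mbuf *dup;

		if (ip->src_addr != ctx->src_ip)
			continue;
		dup = rte_pktmbuf_clone(pkts[i], ctx->pool);
		if (dup != NULL &&
		    rte_ring_enqueue(ctx->ring, dup) == -ENOBUFS)
			rte_pktmbuf_free(dup);
	}
	return nb_pkts;
}

/* Registration on the requested port/queue would then be a single call:
 * rte_eth_add_rx_callback(port_id, queue_id, capture_rx_cb, &ctx);
 */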
[dpdk-dev] [PATCH 1/5] app/test-pmd: fix nb_rxq and nb_txq checks
Made testpmd changes to validate nb_rxq/nb_txq zero value changes of librte_ether. Signed-off-by: Reshma Pattan --- app/test-pmd/cmdline.c| 11 +-- app/test-pmd/parameters.c | 14 +- app/test-pmd/testpmd.c| 19 +-- 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c index 6d28c1b..fa666d2 100644 --- a/app/test-pmd/cmdline.c +++ b/app/test-pmd/cmdline.c @@ -1,7 +1,7 @@ /*- * BSD LICENSE * - * Copyright(c) 2010-2015 Intel Corporation. All rights reserved. + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. * Copyright(c) 2014 6WIND S.A. * All rights reserved. * @@ -1163,17 +1163,16 @@ cmd_config_rx_tx_parsed(void *parsed_result, printf("Please stop all ports first\n"); return; } - if (!strcmp(res->name, "rxq")) { - if (res->value <= 0) { - printf("rxq %d invalid - must be > 0\n", res->value); + if (!res->value && !nb_txq) { + printf("Warning: Either rx or tx queues should non be zero\n"); return; } nb_rxq = res->value; } else if (!strcmp(res->name, "txq")) { - if (res->value <= 0) { - printf("txq %d invalid - must be > 0\n", res->value); + if (!res->value && !nb_rxq) { + printf("Warning: Either rx or tx queues should non be zero\n"); return; } nb_txq = res->value; diff --git a/app/test-pmd/parameters.c b/app/test-pmd/parameters.c index 4b421c8..55572eb 100644 --- a/app/test-pmd/parameters.c +++ b/app/test-pmd/parameters.c @@ -1,7 +1,7 @@ /*- * BSD LICENSE * - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -810,22 +810,26 @@ launch_args_parse(int argc, char** argv) rss_hf = ETH_RSS_UDP; if (!strcmp(lgopts[opt_idx].name, "rxq")) { n = atoi(optarg); - if (n >= 1 && n <= (int) MAX_QUEUE_ID) + if (n >= 0 && n <= (int) MAX_QUEUE_ID) nb_rxq = (queueid_t) n; else rte_exit(EXIT_FAILURE, "rxq %d invalid - must be" - " >= 1 && <= %d\n", n, + " >= 0 && <= %d\n", n, (int) MAX_QUEUE_ID); } if (!strcmp(lgopts[opt_idx].name, "txq")) { n = atoi(optarg); - if (n >= 1 && n <= (int) MAX_QUEUE_ID) + if (n >= 0 && n <= (int) MAX_QUEUE_ID) nb_txq = (queueid_t) n; else rte_exit(EXIT_FAILURE, "txq %d invalid - must be" - " >= 1 && <= %d\n", n, + " >= 0 && <= %d\n", n, (int) MAX_QUEUE_ID); } + if (!nb_rxq && !nb_txq) { + rte_exit(EXIT_FAILURE, "Either rx or tx queues should " + "be non-zero\n"); + } if (!strcmp(lgopts[opt_idx].name, "burst")) { n = atoi(optarg); if ((n >= 1) && (n <= MAX_PKT_BURST)) diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c index 1319917..4c8afba 100644 --- a/app/test-pmd/testpmd.c +++ b/app/test-pmd/testpmd.c @@ -1,7 +1,7 @@ /*- * BSD LICENSE * - * Copyright(c) 2010-2015 Intel Corporation. All rights reserved. + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -608,6 +608,7 @@ init_fwd_streams(void) portid_t pid; struct rte_port *port; streamid_t sm_id, nb_fwd_streams_new; + queueid_t q; /* set socket id according to numa or not */ FOREACH_PORT(pid, ports) { @@ -643,7 +644,12 @@ init_fwd_streams(void) } } - nb_fwd_streams_new = (streamid_t)(nb_ports * nb_rxq); + q = RTE_MAX(nb_rxq, nb_txq); + if (q == 0) { + printf("Fail:Cannot allocate fwd streams as number of queues is 0\n"); +
[dpdk-dev] [PATCH 2/5] drivers/net/pcap: add public api to create pcap device
Added new public api to create pcap device from pcaps. Added new header file for API declaration. Added new public api to version map Signed-off-by: Reshma Pattan --- drivers/net/pcap/Makefile |4 +- drivers/net/pcap/rte_eth_pcap.c | 156 +--- drivers/net/pcap/rte_eth_pcap.h | 87 drivers/net/pcap/rte_pmd_pcap_version.map |8 ++ 4 files changed, 236 insertions(+), 19 deletions(-) create mode 100644 drivers/net/pcap/rte_eth_pcap.h diff --git a/drivers/net/pcap/Makefile b/drivers/net/pcap/Makefile index b41d8a2..8e424bf 100644 --- a/drivers/net/pcap/Makefile +++ b/drivers/net/pcap/Makefile @@ -1,6 +1,6 @@ # BSD LICENSE # -# Copyright(c) 2010-2014 Intel Corporation. All rights reserved. +# Copyright(c) 2010-2016 Intel Corporation. All rights reserved. # Copyright(c) 2014 6WIND S.A. # All rights reserved. # @@ -53,7 +53,7 @@ SRCS-$(CONFIG_RTE_LIBRTE_PMD_PCAP) += rte_eth_pcap.c # # Export include files # -SYMLINK-y-include += +SYMLINK-y-include += rte_eth_pcap.h # this lib depends upon: DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_PCAP) += lib/librte_mbuf diff --git a/drivers/net/pcap/rte_eth_pcap.c b/drivers/net/pcap/rte_eth_pcap.c index f9230eb..1da7913 100644 --- a/drivers/net/pcap/rte_eth_pcap.c +++ b/drivers/net/pcap/rte_eth_pcap.c @@ -1,7 +1,7 @@ /*- * BSD LICENSE * - * Copyright(c) 2010-2015 Intel Corporation. All rights reserved. + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. * Copyright(c) 2014 6WIND S.A. * All rights reserved. * @@ -44,7 +44,7 @@ #include -#include +#include "rte_eth_pcap.h" #define RTE_ETH_PCAP_SNAPSHOT_LEN 65535 #define RTE_ETH_PCAP_SNAPLEN ETHER_MAX_JUMBO_FRAME_LEN @@ -85,21 +85,6 @@ struct pcap_tx_queue { char type[ETH_PCAP_ARG_MAXLEN]; }; -struct rx_pcaps { - unsigned num_of_rx; - pcap_t *pcaps[RTE_PMD_RING_MAX_RX_RINGS]; - const char *names[RTE_PMD_RING_MAX_RX_RINGS]; - const char *types[RTE_PMD_RING_MAX_RX_RINGS]; -}; - -struct tx_pcaps { - unsigned num_of_tx; - pcap_dumper_t *dumpers[RTE_PMD_RING_MAX_TX_RINGS]; - pcap_t *pcaps[RTE_PMD_RING_MAX_RX_RINGS]; - const char *names[RTE_PMD_RING_MAX_RX_RINGS]; - const char *types[RTE_PMD_RING_MAX_RX_RINGS]; -}; - struct pmd_internals { struct pcap_rx_queue rx_queue[RTE_PMD_RING_MAX_RX_RINGS]; struct pcap_tx_queue tx_queue[RTE_PMD_RING_MAX_TX_RINGS]; @@ -875,6 +860,143 @@ error: return -1; } +int +rte_eth_from_pcapsndumpers(const char *name, + struct rx_pcaps *rx_queues, + const unsigned nb_rx_queues, + struct tx_pcaps *tx_queues, + const unsigned nb_tx_queues, + const unsigned numa_node) +{ + struct rte_eth_dev_data *data = NULL; + struct pmd_internals *internals = NULL; + struct rte_eth_dev *eth_dev = NULL; + unsigned i; + pcap_dumper_t *dumper; + pcap_t *pcap = NULL; + + hz = rte_get_timer_hz(); + /* do some parameter checking */ + if (!rx_queues && nb_rx_queues > 0) + return -1; + if (!tx_queues && nb_tx_queues > 0) + return -1; + + /* initialize rx and tx pcaps */ + for (i = 0; i < nb_rx_queues; i++) { + if (open_single_rx_pcap(rx_queues->names[i], &pcap) < 0) + return -1; + rx_queues->pcaps[i] = pcap; + } + for (i = 0; i < nb_tx_queues; i++) { + if (open_single_tx_pcap(tx_queues->names[i], &dumper) < 0) + return -1; + tx_queues->dumpers[i] = dumper; + } + + RTE_LOG(INFO, PMD, "Creating pcap-backed ethdev on numa socket %u\n", numa_node); + + /* now do all data allocation - for eth_dev structure, dummy pci driver +* and internal (private) data +*/ + data = rte_zmalloc_socket(name, sizeof(*data), 0, numa_node); + if (!data) + goto error; + + if (nb_rx_queues) { + data->rx_queues = rte_zmalloc_socket(name, 
sizeof(void *) * nb_rx_queues, + 0, numa_node); + if (!data->rx_queues) + goto error; + } + + if (nb_tx_queues) { + data->tx_queues = rte_zmalloc_socket(name, sizeof(void *) * nb_tx_queues, + 0, numa_node); + if (data->tx_queues == NULL) + goto error; + } + + internals = rte_zmalloc_socket(name, sizeof(*internals), 0, numa_node); + if (!internals) + goto error; + + /* reserve an ethdev entry */ + eth_dev = rte_eth_dev_allocate(name, RTE_ETH_DEV_VIRTUAL); + if (!eth_dev) + goto error; + + /* check length of device name */ + if ((strlen(eth_dev->data->name) + 1) > sizeof(data->name)) + goto error
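A hedged usage sketch of the new API, assuming the rx_pcaps/tx_pcaps declarations this patch moves into rte_eth_pcap.h keep the fields shown above (file paths and queue counts are illustrative): RX queues replay packets from existing capture files, TX queues dump transmitted packets.

#include <string.h>
#include "rte_eth_pcap.h"

static int
pcap_vdev_sketch(void)
{
	struct rx_pcaps rx;
	struct tx_pcaps tx;

	memset(&rx, 0, sizeof(rx));
	memset(&tx, 0, sizeof(tx));
	/* The driver opens these files itself and fills pcaps[]/dumpers[]. */
	rx.names[0] = "/tmp/input.pcap";   /* replayed on RX, must exist */
	tx.names[0] = "/tmp/output.pcap";  /* written on TX */

	/* One RX and one TX queue, allocated on NUMA node 0;
	 * a negative value is returned on failure. */
	return rte_eth_from_pcapsndumpers("eth_pcap_example",
					  &rx, 1, &tx, 1, 0);
}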
[dpdk-dev] [PATCH 3/5] app/proc_info: add tcpdump support in secondary process
Added "--tcupdump2 and "--src-ip-filter" command line options for tcpdump support. Added pcap device creation and writing of packets to pcap device for tcpdump. Added socket functionality to communicate with primary process. Signed-off-by: Reshma Pattan --- app/proc_info/main.c | 454 +- 1 files changed, 451 insertions(+), 3 deletions(-) diff --git a/app/proc_info/main.c b/app/proc_info/main.c index 6448d7b..9be1a37 100644 --- a/app/proc_info/main.c +++ b/app/proc_info/main.c @@ -1,7 +1,7 @@ /* * BSD LICENSE * - * Copyright(c) 2010-2015 Intel Corporation. All rights reserved. + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -38,8 +38,25 @@ #include #include #include +#include #include #include +#include +#include +#include +#include + +/* sys/un.h with __USE_MISC uses strlen, which is unsafe */ +#ifdef __USE_MISC +#define REMOVED_USE_MISC +#undef __USE_MISC +#endif +#include +/* make sure we redefine __USE_MISC only if it was previously undefined */ +#ifdef REMOVED_USE_MISC +#define __USE_MISC +#undef REMOVED_USE_MISC +#endif #include #include @@ -58,11 +75,42 @@ #include #include #include +#include + +#ifdef RTE_LIBRTE_PMD_PCAP +#include +#endif /* Maximum long option length for option parsing. */ #define MAX_LONG_OPT_SZ 64 #define RTE_LOGTYPE_APP RTE_LOGTYPE_USER1 +#define APP_ARG_TCPDUMP_MAX_TUPLES 54 +#define TCPDUMP_SOCKET_PATH "%s/tcpdump_mp_socket" +#define CMSGLEN CMSG_LEN(sizeof(int)) +#define TX_DESC_PER_QUEUE 512 +#define RX_DESC_PER_QUEUE 128 +#define BURST_SIZE 32 +#define MBUF_PER_POOL 65535 +#define MBUF_POOL_CACHE_SIZE 250 + + +uint32_t src_ip_filter; + +int socket_fd = -1; +enum tcpdump_msg_type { + REMOVE_RXTX_CBS = 1, + REGISTER_RXTX_CBS = 2 +}; + +enum rx_tx_type { + RX = 1, + TX = 2, + RX_TX_TYPES = 2 +}; + +static struct rte_eth_conf port_conf_default; +volatile uint8_t quit_signal; /**< mask of enabled ports */ static uint32_t enabled_port_mask; /**< Enable stats. */ @@ -76,13 +124,46 @@ static uint32_t reset_xstats; /**< Enable memory info. 
*/ static uint32_t mem_info; +bool is_tcpdump_enabled; +static volatile struct tcpdump_app_stats { + struct { + uint64_t dequeue_pkts; + uint64_t tx_pkts; + uint64_t freed_pkts; + } in __rte_cache_aligned; + struct { + uint64_t dequeue_pkts; + uint64_t tx_pkts; + uint64_t freed_pkts; + } out __rte_cache_aligned; +} tcpdump_app_stats __rte_cache_aligned; + +struct tcpdump_port_queue_tuples { + int num_pq_tuples; + uint8_t port_id[APP_ARG_TCPDUMP_MAX_TUPLES]; + uint8_t queue_id[APP_ARG_TCPDUMP_MAX_TUPLES]; +} __rte_cache_aligned; + +int pcap_vdev_port_id[RX_TX_TYPES]; + +static struct tcpdump_port_queue_tuples tcpdump_pq_t; + +struct output_buffer { + unsigned count; + struct rte_mbuf *mbufs[BURST_SIZE]; +}; + /**< display usage */ + static void proc_info_usage(const char *prgname) { printf("%s [EAL options] -- -p PORTMASK\n" " -m to display DPDK memory zones, segments and TAILQ information\n" " -p PORTMASK: hexadecimal bitmask of ports to retrieve stats for\n" + " --tcpdump (port,queue): port and queue info for capturing packets " + "for tcpdump\n" + " --src-ip-filter \"A.B.C.D\": src ip for tcpdump filtering\n" " --stats: to display port statistics, enabled by default\n" " --xstats: to display extended port statistics, disabled by " "default\n" @@ -117,14 +198,79 @@ parse_portmask(const char *portmask) } +static int +parse_tcpdump(const char *q_arg) +{ + char s[256]; + const char *p, *p0 = q_arg; + char *end; + + enum fieldnames { + FLD_PORT = 0, + FLD_QUEUE, + _NUM_FLD + }; + + unsigned long int_fld[_NUM_FLD]; + char *str_fld[_NUM_FLD]; + int i; + unsigned size; + uint32_t nb_tcpdump_params; + + nb_tcpdump_params = 0; + + while ((p = strchr(p0, '(')) != NULL) { + ++p; + p0 = strchr(p, ')'); + if (p0 == NULL) + return -1; + + size = p0 - p; + if (size >= sizeof(s)) + return -1; + + snprintf(s, sizeof(s), "%.*s", size, p); + if (rte_strsplit(s, sizeof(s), str_fld, _NUM_FLD, ',') != _NUM_FLD) + return -1; + for (i = 0; i < _NUM_FLD; i++) { + errno = 0; + int_fld[i] = strtoul(str_fld[i], &end, 0); + if (errno != 0 || end == str_fld[i] || int_fld[i
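To illustrate the secondary-process side described in the commit message (fetching the shared packets and writing them to the pcap vdev), a rough sketch follows; the ring pointer and pcap port id are assumptions for illustration, and "transmitting" to the pcap vdev is what actually dumps the mbufs into the capture file.

#include <rte_ring.h>
#include <rte_ethdev.h>
#include <rte_mbuf.h>

#define CAP_BURST 32

static void
drain_ring_to_pcap(struct rte_ring *ring, uint8_t pcap_port_id)
{
	struct rte_mbuf *bufs[CAP_BURST];
	unsigned int nb, sent;

	/* Packets were enqueued on the ring by the primary's RX/TX callbacks. */
	nb = rte_ring_dequeue_burst(ring, (void **)bufs, CAP_BURST);
	if (nb == 0)
		return;

	/* The pcap vdev TX path writes the packets to the capture file. */
	sent = rte_eth_tx_burst(pcap_port_id, 0, bufs, nb);

	/* Free whatever the pcap device did not accept. */
	while (sent < nb)
		rte_pktmbuf_free(bufs[sent++]);
}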
[dpdk-dev] [PATCH 5/5] doc: update doc for tcpdump feature
Added tcpdump design changes to the proc_info section of the sample application user guide. Added tcpdump design changes to the env abstraction layer section of the programmer's guide. Signed-off-by: Reshma Pattan --- doc/guides/prog_guide/env_abstraction_layer.rst | 29 +++- doc/guides/sample_app_ug/proc_info.rst | 34 +-- 2 files changed, 59 insertions(+), 4 deletions(-) diff --git a/doc/guides/prog_guide/env_abstraction_layer.rst b/doc/guides/prog_guide/env_abstraction_layer.rst index 89feb69..7455664 100644 --- a/doc/guides/prog_guide/env_abstraction_layer.rst +++ b/doc/guides/prog_guide/env_abstraction_layer.rst @@ -1,5 +1,5 @@ .. BSD LICENSE -Copyright(c) 2010-2014 Intel Corporation. All rights reserved. +Copyright(c) 2010-2016 Intel Corporation. All rights reserved. All rights reserved. Redistribution and use in source and binary forms, with or without @@ -174,6 +174,8 @@ User Space Interrupt Event The EAL creates a host thread to poll the UIO device file descriptors to detect the interrupts. Callbacks can be registered or unregistered by the EAL functions for a specific interrupt event and are called in the host thread asynchronously. +The EAL thread also polls the tcpdump fd. If the polled fd matches the tcpdump fd, tcpdump +processing is initiated. The EAL also allows timed callbacks to be used in the same way as for NIC interrupts. .. note:: @@ -207,6 +209,31 @@ The eth_dev driver takes responsibility to program the latter mapping. The RX interrupt are controlled/enabled/disabled by ethdev APIs - 'rte_eth_dev_rx_intr_*'. They return failure if the PMD hasn't support them yet. The intr_conf.rxq flag is used to turn on the capability of RX interrupt per device. ++ Tcpdump +Creates a socket for the tcpdump connection with the secondary process and registers the socket with +the tcpdump epoll event. The tcpdump event is polled as part of the interrupt thread. + +Creates a mempool and two rte_rings, used for packet duplication and for sharing the packets +with the secondary process respectively. + +Upon receiving a tcpdump event, reads either a register RX/TX callbacks or a remove +RX/TX callbacks message from the secondary process on the socket. + +For the Register RX/TX callbacks message: +Processes the port, queue and src ip filter options of the message. +Registers rte_eth_rxtx_callbacks for the given port and queue. + +The RX callback applies the src ip filter on received packets; matched packets are +duplicated into the new mempool. Duplicated packets are enqueued to one of the rte_rings for +the secondary process to use. + +The TX callback applies the src ip filter on the original packets; if a packet matches, +its reference count is incremented and the packet is enqueued to the second rte_ring for +the secondary process to use. + +For the Remove RX/TX callbacks message: +Removes the registered rte_eth_rxtx callbacks for the given port and queue. + Blacklisting diff --git a/doc/guides/sample_app_ug/proc_info.rst b/doc/guides/sample_app_ug/proc_info.rst index 542950b..42e8366 100644 --- a/doc/guides/sample_app_ug/proc_info.rst +++ b/doc/guides/sample_app_ug/proc_info.rst @@ -1,6 +1,6 @@ .. BSD LICENSE -Copyright(c) 2015 Intel Corporation. All rights reserved. +Copyright(c) 2015-2016 Intel Corporation. All rights reserved. All rights reserved. Redistribution and use in source and binary forms, with or without @@ -39,19 +39,47 @@ statistics, resetting port statistics and printing DPDK memory information. This application extends the original functionality that was supported by dump_cfg. +This application now supports a tcpdump feature. 
Using command line options, +the user passes port and queue information along with a src ip as the filtering option. + +Overview of tcpdump flow +-- +*Parses the given port, queue and src ip filtering options. + +*Creates a socket to communicate the port, queue and src ip filter information to the +primary process, which registers the RX/TX callbacks. + +*Creates two pcap devices with TX queues only, one for ingress and the other for +egress tcpdump packets. + +*Fetches packets from the rte_rings which are used between the primary process and +proc_info for sharing the ingress and egress packets of the given port and queue. + +*Writes the ingress and egress packets of the given port and queue +to the ingress and egress pcap devices respectively. + +*Upon application termination, i.e. Ctrl+C, sends a request on the socket to remove the +RX/TX callbacks registered by the primary process. + Running the Application --- The application has a number of command line options: .. code-block:: console - ./$(RTE_TARGET)/app/dpdk_proc_info -- -m | [-p PORTMASK] [--stats | --xstats | - --stats-reset | --xstats-reset] + ./$(RTE_TARGET)/app/dpdk_proc_info -- -m | [-p PORTMASK] [--tcpdump (port,queue)] [ --src-ip-filter \"A.B.C.D\"] +[--stats | --xstats | --stats-reset | --xstats-reset] Parameters
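The TX-side capture described in the environment abstraction layer update above (bump the reference count of a matching packet and enqueue the original mbuf on a second ring) could look roughly like the sketch below; the callback signature follows rte_eth_add_tx_callback(), while the ring/global names, the Ethernet-type check and the byte order assumed for src_ip_filter are illustrative assumptions, not the patch's exact code.

#include <rte_common.h>
#include <rte_mbuf.h>
#include <rte_ring.h>
#include <rte_ether.h>
#include <rte_ip.h>
#include <rte_byteorder.h>

extern struct rte_ring *prim_to_sec_tx;	/* ring shared with the secondary */
extern uint32_t src_ip_filter;		/* assumed to be in network byte order */

static uint16_t
tx_capture_cb(uint8_t port __rte_unused, uint16_t queue __rte_unused,
	struct rte_mbuf *pkts[], uint16_t nb_pkts, void *user_param __rte_unused)
{
	uint16_t i;

	for (i = 0; i < nb_pkts; i++) {
		struct ether_hdr *eth = rte_pktmbuf_mtod(pkts[i], struct ether_hdr *);
		struct ipv4_hdr *ip = (struct ipv4_hdr *)(eth + 1);

		if (eth->ether_type != rte_cpu_to_be_16(ETHER_TYPE_IPv4) ||
				ip->src_addr != src_ip_filter)
			continue;

		/* Keep the mbuf alive until the secondary has written it out. */
		rte_mbuf_refcnt_update(pkts[i], 1);
		if (rte_ring_enqueue_burst(prim_to_sec_tx, (void **)&pkts[i], 1) == 0)
			rte_mbuf_refcnt_update(pkts[i], -1);	/* ring full, undo */
	}
	return nb_pkts;
}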
[dpdk-dev] [PATCH 4/5] lib/librte_eal: add tcpdump support in primary process
Added tcpdump functionality to eal interrupt thread. Enhanced interrupt thread to support tcpdump socket and message processing from secondary. Created new mempool and rings to handle packets of tcpdump. Added rte_eth_rxtx_callbacks for ingress/egress packets processing for tcpdump. Added functionality to remove registered rte_eth_rxtx_callbacks once secondary process is terminated. Signed-off-by: Reshma Pattan --- lib/librte_eal/linuxapp/eal/Makefile |5 +- lib/librte_eal/linuxapp/eal/eal_interrupts.c | 376 +- 2 files changed, 378 insertions(+), 3 deletions(-) diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile index 26eced5..a9a2c36 100644 --- a/lib/librte_eal/linuxapp/eal/Makefile +++ b/lib/librte_eal/linuxapp/eal/Makefile @@ -1,6 +1,6 @@ # BSD LICENSE # -# Copyright(c) 2010-2015 Intel Corporation. All rights reserved. +# Copyright(c) 2010-2016 Intel Corporation. All rights reserved. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -45,6 +45,9 @@ CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common/include CFLAGS += -I$(RTE_SDK)/lib/librte_ring CFLAGS += -I$(RTE_SDK)/lib/librte_mempool CFLAGS += -I$(RTE_SDK)/lib/librte_ivshmem +CFLAGS += -I$(RTE_SDK)/lib/librte_mbuf +CFLAGS += -I$(RTE_SDK)/lib/librte_ether +CFLAGS += -I$(RTE_SDK)/lib/librte_net CFLAGS += $(WERROR_FLAGS) -O3 # specific to linuxapp exec-env diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c index 06b26a9..c370dc6 100644 --- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c +++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c @@ -1,7 +1,7 @@ /*- * BSD LICENSE * - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -45,7 +45,11 @@ #include #include #include +#include +#include #include +#include +#include #include #include @@ -65,15 +69,41 @@ #include #include #include +#include +#include +#include +#include +#include #include "eal_private.h" #include "eal_vfio.h" #include "eal_thread.h" +#include "eal_internal_cfg.h" #define EAL_INTR_EPOLL_WAIT_FOREVER (-1) #define NB_OTHER_INTR 1 +#define TCPDUMP_SOCKET_PATH "%s/tcpdump_mp_socket" +#define TCPDUMP_SOCKET_ERR 0xFF +#define TCPDUMP_REQ 0x1 +#define RING_SIZE 1024 +#define BURST_SIZE 32 +#define NUM_MBUFS 65536 +#define MBUF_CACHE_SIZE 250 +#define MAX_CBS 54 static RTE_DEFINE_PER_LCORE(int, _epfd) = -1; /**< epoll fd per thread */ +uint32_t src_ip_filter; +int conn_sock; +static int tcpdump_socket_fd; +struct rte_ring *prim_to_sec_rx; +struct rte_ring *prim_to_sec_tx; +struct rte_mempool *tcpdump_pktmbuf_pool; +struct rxtx_cbs { + uint8_t port; + uint16_t queue; + struct rte_eth_rxtx_callback *rx_cb; + struct rte_eth_rxtx_callback *tx_cb; +} cbs[54]; /** * union for pipe fds. 
@@ -644,6 +674,259 @@ rte_intr_disable(struct rte_intr_handle *intr_handle) return 0; } +static inline void +tcpdump_pktmbuf_duplicate(struct rte_mbuf *mi, struct rte_mbuf *m) +{ + + mi->data_len = m->data_len; + mi->port = m->port; + mi->vlan_tci = m->vlan_tci; + mi->vlan_tci_outer = m->vlan_tci_outer; + mi->tx_offload = m->tx_offload; + mi->hash = m->hash; + + mi->pkt_len = mi->data_len; + mi->ol_flags = m->ol_flags; + mi->packet_type = m->packet_type; + + rte_memcpy(rte_pktmbuf_mtod(mi, void *), + rte_pktmbuf_mtod(m, void *), + rte_pktmbuf_data_len(mi)); + + __rte_mbuf_sanity_check(mi, 1); + __rte_mbuf_sanity_check(m, 0); +} + +static inline struct rte_mbuf * +tcpdump_pktmbuf_clone(struct rte_mbuf *md, struct rte_mempool *mp) +{ + struct rte_mbuf *mc, *mi, **prev; + uint32_t pktlen; + uint8_t nseg; + + mc = rte_pktmbuf_alloc(mp); + if (unlikely(mc == NULL)) + return NULL; + + mi = mc; + prev = &mi->next; + pktlen = md->pkt_len; + nseg = 0; + + do { + nseg++; + tcpdump_pktmbuf_duplicate(mi, md); + *prev = mi; + prev = &mi->next; + } while ((md = md->next) != NULL && + (mi = rte_pktmbuf_alloc(mp)) != NULL); + + *prev = NULL; + mc->nb_segs = nseg; + mc->pkt_len = pktlen; + + /* Allocation of new indirect segment failed */ + if (unlikely(mi == NULL)) { + rte_pktmbuf_free(mc); + return NULL; + } + + __rte_mbuf_sanity_check(mc, 1); + return mc; + +} + +static int +compare_filter(struct rte_mbuf *pkt) +{ + struct ipv4_hdr *pkt_hdr = rte_pktmbuf_mtod_offset(pkt, struct ipv4_hdr *, +
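For completeness, a sketch of how the RX-side callback described in this patch could use the clone helper above; it reuses the globals and tcpdump_pktmbuf_clone() added by the patch, while the Ethernet-type check and the byte order assumed for src_ip_filter are illustrative assumptions rather than the patch's exact filtering logic.

/* Sketch only: relies on src_ip_filter, prim_to_sec_rx, tcpdump_pktmbuf_pool
 * and tcpdump_pktmbuf_clone() defined earlier in this file. */
static uint16_t
rx_capture_cb(uint8_t port __rte_unused, uint16_t queue __rte_unused,
	struct rte_mbuf **pkts, uint16_t nb_pkts,
	uint16_t max_pkts __rte_unused, void *user_param __rte_unused)
{
	uint16_t i;

	for (i = 0; i < nb_pkts; i++) {
		struct ether_hdr *eth = rte_pktmbuf_mtod(pkts[i], struct ether_hdr *);
		struct ipv4_hdr *ip = (struct ipv4_hdr *)(eth + 1);
		struct rte_mbuf *dup;

		if (eth->ether_type != rte_cpu_to_be_16(ETHER_TYPE_IPv4) ||
				ip->src_addr != src_ip_filter)	/* assumed network byte order */
			continue;

		dup = tcpdump_pktmbuf_clone(pkts[i], tcpdump_pktmbuf_pool);
		if (dup == NULL)
			continue;

		/* Drop the copy if the ring is full; the original packet
		 * continues through the normal RX path untouched. */
		if (rte_ring_enqueue_burst(prim_to_sec_rx, (void **)&dup, 1) == 0)
			rte_pktmbuf_free(dup);
	}
	return nb_pkts;
}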