From: Long Li <lon...@microsoft.com> MANA is a PCI device. It uses IB verbs to access hardware through the kernel RDMA layer. This patch introduces build environment and basic device probe functions.
Signed-off-by: Long Li <lon...@microsoft.com> --- Change log: v2: Fix typos. Make the driver build only on x86-64 and Linux. Remove unused header files. Change port definition to uint16_t or uint8_t (for IB). Use getline() in place of fgets() to read and truncate a line. v3: Add meson build check for required functions from RDMA direct verb header file MAINTAINERS | 6 + doc/guides/nics/features/mana.ini | 10 + doc/guides/nics/index.rst | 1 + doc/guides/nics/mana.rst | 66 +++ drivers/net/mana/mana.c | 704 ++++++++++++++++++++++++++++++ drivers/net/mana/mana.h | 210 +++++++++ drivers/net/mana/meson.build | 44 ++ drivers/net/mana/mp.c | 235 ++++++++++ drivers/net/mana/version.map | 3 + drivers/net/meson.build | 1 + 10 files changed, 1280 insertions(+) create mode 100644 doc/guides/nics/features/mana.ini create mode 100644 doc/guides/nics/mana.rst create mode 100644 drivers/net/mana/mana.c create mode 100644 drivers/net/mana/mana.h create mode 100644 drivers/net/mana/meson.build create mode 100644 drivers/net/mana/mp.c create mode 100644 drivers/net/mana/version.map diff --git a/MAINTAINERS b/MAINTAINERS index 18d9edaf88..b8bda48a33 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -837,6 +837,12 @@ F: buildtools/options-ibverbs-static.sh F: doc/guides/nics/mlx5.rst F: doc/guides/nics/features/mlx5.ini +Microsoft mana +M: Long Li <lon...@microsoft.com> +F: drivers/net/mana +F: doc/guides/nics/mana.rst +F: doc/guides/nics/features/mana.ini + Microsoft vdev_netvsc - EXPERIMENTAL M: Matan Azrad <ma...@nvidia.com> F: drivers/net/vdev_netvsc/ diff --git a/doc/guides/nics/features/mana.ini b/doc/guides/nics/features/mana.ini new file mode 100644 index 0000000000..b92a27374c --- /dev/null +++ b/doc/guides/nics/features/mana.ini @@ -0,0 +1,10 @@ +; +; Supported features of the 'mana' network poll mode driver. +; +; Refer to default.ini for the full list of available PMD features. 
+; +[Features] +Linux = Y +Multiprocess aware = Y +Usage doc = Y +x86-64 = Y diff --git a/doc/guides/nics/index.rst b/doc/guides/nics/index.rst index 1c94caccea..2725d1d9f0 100644 --- a/doc/guides/nics/index.rst +++ b/doc/guides/nics/index.rst @@ -41,6 +41,7 @@ Network Interface Controller Drivers intel_vf kni liquidio + mana memif mlx4 mlx5 diff --git a/doc/guides/nics/mana.rst b/doc/guides/nics/mana.rst new file mode 100644 index 0000000000..40e18fe810 --- /dev/null +++ b/doc/guides/nics/mana.rst @@ -0,0 +1,66 @@ +.. SPDX-License-Identifier: BSD-3-Clause + Copyright 2022 Microsoft Corporation + +MANA poll mode driver library +============================= + +The MANA poll mode driver library (**librte_net_mana**) implements support +for Microsoft Azure Network Adapter VF in SR-IOV context. + +Features +-------- + +Features of the MANA Ethdev PMD are: + +Prerequisites +------------- + +This driver relies on external libraries and kernel drivers for resources +allocations and initialization. The following dependencies are not part of +DPDK and must be installed separately: + +- **libibverbs** (provided by rdma-core package) + + User space verbs framework used by librte_net_mana. This library provides + a generic interface between the kernel and low-level user space drivers + such as libmana. + + It allows slow and privileged operations (context initialization, hardware + resources allocations) to be managed by the kernel and fast operations to + never leave user space. + +- **libmana** (provided by rdma-core package) + + Low-level user space driver library for Microsoft Azure Network Adapter + devices, it is automatically loaded by libibverbs. + +- **Kernel modules** + + They provide the kernel-side verbs API and low level device drivers that + manage actual hardware initialization and resources sharing with user + space processes. 
+ + Unlike most other PMDs, these modules must remain loaded and bound to + their devices: + + - mana: Ethernet device driver that provides kernel network interfaces. + - mana_ib: InfiniBand device driver. + - ib_uverbs: user space driver for verbs (entry point for libibverbs). + +Driver compilation and testing +------------------------------ + +Refer to the document :ref:`compiling and testing a PMD for a NIC <pmd_build_and_test>` +for details. + +MANA PMD arguments +-------------------- + +The user can specify the following argument in devargs. + +#. ``mac``: + + Specify the MAC address for this device. If it is set, the driver + probes and loads the NIC with a matching MAC address. If it is not + set, the driver probes on all the NICs on the PCI device. The default + value is not set, meaning all the NICs will be probed and loaded. diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c new file mode 100644 index 0000000000..63ec1f75f0 --- /dev/null +++ b/drivers/net/mana/mana.c @@ -0,0 +1,704 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2022 Microsoft Corporation + */ + +#include <unistd.h> +#include <dirent.h> +#include <fcntl.h> +#include <sys/mman.h> + +#include <ethdev_driver.h> +#include <ethdev_pci.h> +#include <rte_kvargs.h> +#include <rte_eal_paging.h> + +#include <infiniband/verbs.h> +#include <infiniband/manadv.h> + +#include <assert.h> + +#include "mana.h" + +/* Shared memory between primary/secondary processes, per driver */ +struct mana_shared_data *mana_shared_data; +const struct rte_memzone *mana_shared_mz; +static const char *MZ_MANA_SHARED_DATA = "mana_shared_data"; + +struct mana_shared_data mana_local_data; + +/* Spinlock for mana_shared_data */ +static rte_spinlock_t mana_shared_data_lock = RTE_SPINLOCK_INITIALIZER; + +/* Allocate a buffer on the stack and fill it with a printf format string. */ +#define MKSTR(name, ...)
\ + int mkstr_size_##name = snprintf(NULL, 0, "" __VA_ARGS__); \ + char name[mkstr_size_##name + 1]; \ + \ + memset(name, 0, mkstr_size_##name + 1); \ + snprintf(name, sizeof(name), "" __VA_ARGS__) + +int mana_logtype_driver; +int mana_logtype_init; + +const struct eth_dev_ops mana_dev_ops = { +}; + +const struct eth_dev_ops mana_dev_sec_ops = { +}; + +uint16_t +mana_rx_burst_removed(void *dpdk_rxq __rte_unused, + struct rte_mbuf **pkts __rte_unused, + uint16_t pkts_n __rte_unused) +{ + rte_mb(); + return 0; +} + +uint16_t +mana_tx_burst_removed(void *dpdk_rxq __rte_unused, + struct rte_mbuf **pkts __rte_unused, + uint16_t pkts_n __rte_unused) +{ + rte_mb(); + return 0; +} + +static const char *mana_init_args[] = { + "mac", + NULL, +}; + +/* Support of parsing up to 8 mac address from EAL command line */ +#define MAX_NUM_ADDRESS 8 +struct mana_conf { + struct rte_ether_addr mac_array[MAX_NUM_ADDRESS]; + unsigned int index; +}; + +static int mana_arg_parse_callback(const char *key, const char *val, + void *private) +{ + struct mana_conf *conf = (struct mana_conf *)private; + int ret; + + DRV_LOG(INFO, "key=%s value=%s index=%d", key, val, conf->index); + + if (conf->index >= MAX_NUM_ADDRESS) { + DRV_LOG(ERR, "Exceeding max MAC address"); + return 1; + } + + ret = rte_ether_unformat_addr(val, &conf->mac_array[conf->index]); + if (ret) { + DRV_LOG(ERR, "Invalid MAC address %s", val); + return ret; + } + + conf->index++; + + return 0; +} + +static int mana_parse_args(struct rte_devargs *devargs, struct mana_conf *conf) +{ + struct rte_kvargs *kvlist; + unsigned int arg_count; + int ret = 0; + + kvlist = rte_kvargs_parse(devargs->args, mana_init_args); + if (!kvlist) { + DRV_LOG(ERR, "failed to parse kvargs args=%s", devargs->args); + return -EINVAL; + } + + arg_count = rte_kvargs_count(kvlist, mana_init_args[0]); + if (arg_count > MAX_NUM_ADDRESS) { + ret = -EINVAL; + goto free_kvlist; + } + ret = rte_kvargs_process(kvlist, mana_init_args[0], + mana_arg_parse_callback, 
conf); + if (ret) { + DRV_LOG(ERR, "error parsing args"); + goto free_kvlist; + } + +free_kvlist: + rte_kvargs_free(kvlist); + return ret; +} + +static int get_port_mac(struct ibv_device *device, unsigned int port, + struct rte_ether_addr *addr) +{ + FILE *file; + int ret = 0; + DIR *dir; + struct dirent *dent; + unsigned int dev_port; + char mac[20]; + + MKSTR(path, "%s/device/net", device->ibdev_path); + + dir = opendir(path); + if (!dir) + return -ENOENT; + + while ((dent = readdir(dir))) { + char *name = dent->d_name; + + MKSTR(filepath, "%s/%s/dev_port", path, name); + + /* Ignore . and .. */ + if ((name[0] == '.') && + ((name[1] == '\0') || + ((name[1] == '.') && (name[2] == '\0')))) + continue; + + file = fopen(filepath, "rb"); + if (!file) + continue; + + ret = fscanf(file, "%u", &dev_port); + fclose(file); + + if (ret != 1) + continue; + + /* Ethernet ports start at 0, IB port start at 1 */ + if (dev_port == port - 1) { + MKSTR(filepath, "%s/%s/address", path, name); + + file = fopen(filepath, "rb"); + if (!file) + continue; + + ret = fscanf(file, "%s", mac); + fclose(file); + + if (ret < 0) + break; + + ret = rte_ether_unformat_addr(mac, addr); + if (ret) + DRV_LOG(ERR, "unrecognized mac addr %s", mac); + break; + } + } + + closedir(dir); + return ret; +} + +static int mana_ibv_device_to_pci_addr(const struct ibv_device *device, + struct rte_pci_addr *pci_addr) +{ + FILE *file; + char *line = NULL; + size_t len = 0; + + MKSTR(path, "%s/device/uevent", device->ibdev_path); + + file = fopen(path, "rb"); + if (!file) + return -errno; + + while (getline(&line, &len, file) != -1) { + /* Extract information. 
*/ + if (sscanf(line, + "PCI_SLOT_NAME=" + "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n", + &pci_addr->domain, + &pci_addr->bus, + &pci_addr->devid, + &pci_addr->function) == 4) { + break; + } + } + + free(line); + fclose(file); + return 0; +} + +static int mana_proc_priv_init(struct rte_eth_dev *dev) +{ + struct mana_process_priv *priv; + + priv = rte_zmalloc_socket("mana_proc_priv", + sizeof(struct mana_process_priv), + RTE_CACHE_LINE_SIZE, + dev->device->numa_node); + if (!priv) + return -ENOMEM; + + dev->process_private = priv; + return 0; +} + +static int mana_map_doorbell_secondary(struct rte_eth_dev *eth_dev, int fd) +{ + struct mana_process_priv *priv = eth_dev->process_private; + + void *addr; + + addr = mmap(NULL, rte_mem_page_size(), PROT_WRITE, MAP_SHARED, fd, 0); + if (addr == MAP_FAILED) { + DRV_LOG(ERR, "Failed to map secondary doorbell port %u", + eth_dev->data->port_id); + return -ENOMEM; + } + + DRV_LOG(INFO, "Secondary doorbell mapped to %p", addr); + + priv->db_page = addr; + + return 0; +} + +/* Initialize shared data for the driver (all devices) */ +static int mana_init_shared_data(void) +{ + int ret = 0; + const struct rte_memzone *secondary_mz; + + rte_spinlock_lock(&mana_shared_data_lock); + + /* Skip if shared data is already initialized */ + if (mana_shared_data) + goto exit; + + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + mana_shared_mz = rte_memzone_reserve(MZ_MANA_SHARED_DATA, + sizeof(*mana_shared_data), + SOCKET_ID_ANY, 0); + if (!mana_shared_mz) { + DRV_LOG(ERR, "Cannot allocate mana shared data"); + ret = -rte_errno; + goto exit; + } + + mana_shared_data = mana_shared_mz->addr; + memset(mana_shared_data, 0, sizeof(*mana_shared_data)); + rte_spinlock_init(&mana_shared_data->lock); + } else { + secondary_mz = rte_memzone_lookup(MZ_MANA_SHARED_DATA); + if (!secondary_mz) { + DRV_LOG(ERR, "Cannot attach mana shared data"); + ret = -rte_errno; + goto exit; + } + + mana_shared_data = secondary_mz->addr; + 
memset(&mana_local_data, 0, sizeof(mana_local_data)); + } + +exit: + rte_spinlock_unlock(&mana_shared_data_lock); + + return ret; +} + +static int mana_init_once(void) +{ + int ret; + + ret = mana_init_shared_data(); + if (ret) + return ret; + + rte_spinlock_lock(&mana_shared_data->lock); + + switch (rte_eal_process_type()) { + case RTE_PROC_PRIMARY: + if (mana_shared_data->init_done) + break; + + ret = mana_mp_init_primary(); + if (ret) + break; + DRV_LOG(ERR, "MP INIT PRIMARY"); + + mana_shared_data->init_done = 1; + break; + + case RTE_PROC_SECONDARY: + + if (mana_local_data.init_done) + break; + + ret = mana_mp_init_secondary(); + if (ret) + break; + + DRV_LOG(ERR, "MP INIT SECONDARY"); + + mana_local_data.init_done = 1; + break; + + default: + /* Impossible, internal error */ + ret = -EPROTO; + break; + } + + rte_spinlock_unlock(&mana_shared_data->lock); + + return ret; +} + +static int mana_pci_probe_mac(struct rte_pci_driver *pci_drv __rte_unused, + struct rte_pci_device *pci_dev, + struct rte_ether_addr *mac_addr) +{ + struct ibv_device **ibv_list; + int ibv_idx; + struct ibv_context *ctx; + struct ibv_device_attr_ex dev_attr; + int num_devices; + int ret = 0; + uint8_t port; + struct mana_priv *priv = NULL; + struct rte_eth_dev *eth_dev = NULL; + bool found_port; + + ibv_list = ibv_get_device_list(&num_devices); + for (ibv_idx = 0; ibv_idx < num_devices; ibv_idx++) { + struct ibv_device *ibdev = ibv_list[ibv_idx]; + struct rte_pci_addr pci_addr; + + DRV_LOG(INFO, "Probe device name %s dev_name %s ibdev_path %s", + ibdev->name, ibdev->dev_name, ibdev->ibdev_path); + + if (mana_ibv_device_to_pci_addr(ibdev, &pci_addr)) + continue; + + /* Ignore if this IB device is not this PCI device */ + if (pci_dev->addr.domain != pci_addr.domain || + pci_dev->addr.bus != pci_addr.bus || + pci_dev->addr.devid != pci_addr.devid || + pci_dev->addr.function != pci_addr.function) + continue; + + ctx = ibv_open_device(ibdev); + if (!ctx) { + DRV_LOG(ERR, "Failed to open IB 
device %s", + ibdev->name); + continue; + } + + ret = ibv_query_device_ex(ctx, NULL, &dev_attr); + DRV_LOG(INFO, "dev_attr.orig_attr.phys_port_cnt %u", + dev_attr.orig_attr.phys_port_cnt); + found_port = false; + + for (port = 1; port <= dev_attr.orig_attr.phys_port_cnt; + port++) { + struct ibv_parent_domain_init_attr attr = {}; + struct rte_ether_addr addr; + char address[64]; + char name[RTE_ETH_NAME_MAX_LEN]; + + ret = get_port_mac(ibdev, port, &addr); + if (ret) + continue; + + if (mac_addr && !rte_is_same_ether_addr(&addr, mac_addr)) + continue; + + rte_ether_format_addr(address, sizeof(address), &addr); + DRV_LOG(INFO, "device located port %u address %s", + port, address); + found_port = true; + + priv = rte_zmalloc_socket(NULL, sizeof(*priv), + RTE_CACHE_LINE_SIZE, + SOCKET_ID_ANY); + if (!priv) { + ret = -ENOMEM; + goto failed; + } + + snprintf(name, sizeof(name), "%s_port%d", + pci_dev->device.name, port); + + if (rte_eal_process_type() == RTE_PROC_SECONDARY) { + int fd; + + eth_dev = rte_eth_dev_attach_secondary(name); + if (!eth_dev) { + DRV_LOG(ERR, "Can't attach to dev %s", + name); + ret = -ENOMEM; + goto failed; + } + + eth_dev->device = &pci_dev->device; + eth_dev->dev_ops = &mana_dev_sec_ops; + ret = mana_proc_priv_init(eth_dev); + if (ret) + goto failed; + priv->process_priv = eth_dev->process_private; + + /* Get the IB FD from the primary process */ + fd = mana_mp_req_verbs_cmd_fd(eth_dev); + if (fd < 0) { + DRV_LOG(ERR, "Failed to get FD %d", fd); + ret = -ENODEV; + goto failed; + } + + ret = mana_map_doorbell_secondary(eth_dev, fd); + if (ret) { + DRV_LOG(ERR, "Failed secondary map %d", + fd); + goto failed; + } + + /* fd is not used after mapping doorbell */ + close(fd); + + rte_spinlock_lock(&mana_shared_data->lock); + mana_shared_data->secondary_cnt++; + mana_local_data.secondary_cnt++; + rte_spinlock_unlock(&mana_shared_data->lock); + + rte_eth_copy_pci_info(eth_dev, pci_dev); + rte_eth_dev_probing_finish(eth_dev); + + /* Impossible to
have more than one port + * matching a MAC address + */ + continue; + } + + eth_dev = rte_eth_dev_allocate(name); + if (!eth_dev) { + ret = -ENOMEM; + goto failed; + } + + eth_dev->data->mac_addrs = + rte_calloc("mana_mac", 1, + sizeof(struct rte_ether_addr), 0); + if (!eth_dev->data->mac_addrs) { + ret = -ENOMEM; + goto failed; + } + + rte_ether_addr_copy(&addr, eth_dev->data->mac_addrs); + + priv->ib_pd = ibv_alloc_pd(ctx); + if (!priv->ib_pd) { + DRV_LOG(ERR, "ibv_alloc_pd failed port %d", port); + ret = -ENOMEM; + goto failed; + } + + /* Create a parent domain with the port number */ + attr.pd = priv->ib_pd; + attr.comp_mask = IBV_PARENT_DOMAIN_INIT_ATTR_PD_CONTEXT; + attr.pd_context = (void *)(uint64_t)port; + priv->ib_parent_pd = ibv_alloc_parent_domain(ctx, &attr); + if (!priv->ib_parent_pd) { + DRV_LOG(ERR, + "ibv_alloc_parent_domain failed port %d", + port); + ret = -ENOMEM; + goto failed; + } + + priv->ib_ctx = ctx; + priv->port_id = eth_dev->data->port_id; + priv->dev_port = port; + eth_dev->data->dev_private = priv; + priv->dev_data = eth_dev->data; + + priv->max_rx_queues = dev_attr.orig_attr.max_qp; + priv->max_tx_queues = dev_attr.orig_attr.max_qp; + + priv->max_rx_desc = + RTE_MIN(dev_attr.orig_attr.max_qp_wr, + dev_attr.orig_attr.max_cqe); + priv->max_tx_desc = + RTE_MIN(dev_attr.orig_attr.max_qp_wr, + dev_attr.orig_attr.max_cqe); + + priv->max_send_sge = dev_attr.orig_attr.max_sge; + priv->max_recv_sge = dev_attr.orig_attr.max_sge; + + priv->max_mr = dev_attr.orig_attr.max_mr; + priv->max_mr_size = dev_attr.orig_attr.max_mr_size; + + DRV_LOG(INFO, "dev %s max queues %d desc %d sge %d\n", + name, priv->max_rx_queues, priv->max_rx_desc, + priv->max_send_sge); + + rte_spinlock_lock(&mana_shared_data->lock); + mana_shared_data->primary_cnt++; + rte_spinlock_unlock(&mana_shared_data->lock); + + eth_dev->data->dev_flags |= RTE_ETH_DEV_INTR_RMV; + + eth_dev->device = &pci_dev->device; + eth_dev->data->dev_flags |= + RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS; + + 
DRV_LOG(INFO, "device %s at port %u", + name, eth_dev->data->port_id); + + eth_dev->rx_pkt_burst = mana_rx_burst_removed; + eth_dev->tx_pkt_burst = mana_tx_burst_removed; + eth_dev->dev_ops = &mana_dev_ops; + + rte_eth_copy_pci_info(eth_dev, pci_dev); + rte_eth_dev_probing_finish(eth_dev); + } + + /* Secondary process doesn't need an ibv_ctx. It maps the + * doorbell pages using the IB cmd_fd passed from the primary + * process and sends messages to primary process for memory + * registrations. + */ + if (!found_port || rte_eal_process_type() == RTE_PROC_SECONDARY) + ibv_close_device(ctx); + } + + ibv_free_device_list(ibv_list); + return 0; + +failed: + /* Free the resources of the port that failed */ + if (priv) { + if (priv->ib_parent_pd) + ibv_dealloc_pd(priv->ib_parent_pd); + + if (priv->ib_pd) + ibv_dealloc_pd(priv->ib_pd); + } + + if (eth_dev) + rte_eth_dev_release_port(eth_dev); + + rte_free(priv); + + ibv_close_device(ctx); + ibv_free_device_list(ibv_list); + + return ret; +} + +static int mana_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, + struct rte_pci_device *pci_dev) +{ + struct rte_devargs *args = pci_dev->device.devargs; + struct mana_conf conf = {}; + unsigned int i; + int ret; + + if (args && args->args) { + ret = mana_parse_args(args, &conf); + if (ret) { + DRV_LOG(ERR, "failed to parse parameters args = %s", + args->args); + return ret; + } + } + + ret = mana_init_once(); + if (ret) { + DRV_LOG(ERR, "Failed to init PMD global data %d", ret); + return ret; + } + + /* If there are no driver parameters, probe on all ports */ + if (!conf.index) + return mana_pci_probe_mac(pci_drv, pci_dev, NULL); + + for (i = 0; i < conf.index; i++) { + ret = mana_pci_probe_mac(pci_drv, pci_dev, &conf.mac_array[i]); + if (ret) + return ret; + } + + return 0; +} + +static int mana_dev_uninit(struct rte_eth_dev *dev) +{ + RTE_SET_USED(dev); + return 0; +} + +static int mana_pci_remove(struct rte_pci_device *pci_dev) +{ + if (rte_eal_process_type() ==
RTE_PROC_PRIMARY) { + rte_spinlock_lock(&mana_shared_data_lock); + + rte_spinlock_lock(&mana_shared_data->lock); + + RTE_VERIFY(mana_shared_data->primary_cnt > 0); + mana_shared_data->primary_cnt--; + if (!mana_shared_data->primary_cnt) { + DRV_LOG(DEBUG, "mp uninit primary"); + mana_mp_uninit_primary(); + } + + rte_spinlock_unlock(&mana_shared_data->lock); + + /* Also free the shared memory if this is the last */ + if (!mana_shared_data->primary_cnt) { + DRV_LOG(DEBUG, "free shared memezone data"); + rte_memzone_free(mana_shared_mz); + } + + rte_spinlock_unlock(&mana_shared_data_lock); + } else { + rte_spinlock_lock(&mana_shared_data_lock); + + rte_spinlock_lock(&mana_shared_data->lock); + RTE_VERIFY(mana_shared_data->secondary_cnt > 0); + mana_shared_data->secondary_cnt--; + rte_spinlock_unlock(&mana_shared_data->lock); + + RTE_VERIFY(mana_local_data.secondary_cnt > 0); + mana_local_data.secondary_cnt--; + if (!mana_local_data.secondary_cnt) { + DRV_LOG(DEBUG, "mp uninit secondary"); + mana_mp_uninit_secondary(); + } + + rte_spinlock_unlock(&mana_shared_data_lock); + } + + return rte_eth_dev_pci_generic_remove(pci_dev, mana_dev_uninit); +} + +static const struct rte_pci_id mana_pci_id_map[] = { + { + RTE_PCI_DEVICE(PCI_VENDOR_ID_MICROSOFT, + PCI_DEVICE_ID_MICROSOFT_MANA) + }, +}; + +static struct rte_pci_driver mana_pci_driver = { + .driver = { + .name = "mana_pci", + }, + .id_table = mana_pci_id_map, + .probe = mana_pci_probe, + .remove = mana_pci_remove, + .drv_flags = RTE_PCI_DRV_INTR_RMV, +}; + +RTE_INIT(rte_mana_pmd_init) +{ + rte_pci_register(&mana_pci_driver); +} + +RTE_PMD_EXPORT_NAME(net_mana, __COUNTER__); +RTE_PMD_REGISTER_PCI_TABLE(net_mana, mana_pci_id_map); +RTE_PMD_REGISTER_KMOD_DEP(net_mana, "* ib_uverbs & mana_ib"); +RTE_LOG_REGISTER_SUFFIX(mana_logtype_init, init, NOTICE); +RTE_LOG_REGISTER_SUFFIX(mana_logtype_driver, driver, NOTICE); diff --git a/drivers/net/mana/mana.h b/drivers/net/mana/mana.h new file mode 100644 index 0000000000..e30c030b4e 
--- /dev/null +++ b/drivers/net/mana/mana.h @@ -0,0 +1,210 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2022 Microsoft Corporation + */ + +#ifndef __MANA_H__ +#define __MANA_H__ + +enum { + PCI_VENDOR_ID_MICROSOFT = 0x1414, +}; + +enum { + PCI_DEVICE_ID_MICROSOFT_MANA = 0x00ba, +}; + +/* Shared data between primary/secondary processes */ +struct mana_shared_data { + rte_spinlock_t lock; + int init_done; + unsigned int primary_cnt; + unsigned int secondary_cnt; +}; + +#define MIN_RX_BUF_SIZE 1024 +#define MAX_FRAME_SIZE RTE_ETHER_MAX_LEN +#define BNIC_MAX_MAC_ADDR 1 + +#define BNIC_DEV_RX_OFFLOAD_SUPPORT ( \ + DEV_RX_OFFLOAD_CHECKSUM | \ + DEV_RX_OFFLOAD_RSS_HASH) + +#define BNIC_DEV_TX_OFFLOAD_SUPPORT ( \ + RTE_ETH_TX_OFFLOAD_MULTI_SEGS | \ + RTE_ETH_TX_OFFLOAD_IPV4_CKSUM | \ + RTE_ETH_TX_OFFLOAD_TCP_CKSUM | \ + RTE_ETH_TX_OFFLOAD_UDP_CKSUM | \ + RTE_ETH_TX_OFFLOAD_TCP_TSO) + +#define INDIRECTION_TABLE_NUM_ELEMENTS 64 +#define TOEPLITZ_HASH_KEY_SIZE_IN_BYTES 40 +#define BNIC_ETH_RSS_SUPPORT ( \ + ETH_RSS_IPV4 | \ + ETH_RSS_NONFRAG_IPV4_TCP | \ + ETH_RSS_NONFRAG_IPV4_UDP | \ + ETH_RSS_IPV6 | \ + ETH_RSS_NONFRAG_IPV6_TCP | \ + ETH_RSS_NONFRAG_IPV6_UDP) + +#define MIN_BUFFERS_PER_QUEUE 64 +#define MAX_RECEIVE_BUFFERS_PER_QUEUE 256 +#define MAX_SEND_BUFFERS_PER_QUEUE 256 + +struct mana_process_priv { + void *db_page; +}; + +struct mana_priv { + struct rte_eth_dev_data *dev_data; + struct mana_process_priv *process_priv; + int num_queues; + + /* DPDK port */ + uint16_t port_id; + + /* IB device port */ + uint8_t dev_port; + + struct ibv_context *ib_ctx; + struct ibv_pd *ib_pd; + struct ibv_pd *ib_parent_pd; + struct ibv_rwq_ind_table *ind_table; + uint8_t ind_table_key[40]; + struct ibv_qp *rwq_qp; + void *db_page; + int max_rx_queues; + int max_tx_queues; + int max_rx_desc; + int max_tx_desc; + int max_send_sge; + int max_recv_sge; + int max_mr; + uint64_t max_mr_size; +}; + +struct mana_txq_desc { + struct rte_mbuf *pkt; + uint32_t wqe_size_in_bu; +}; + 
+struct mana_rxq_desc { + struct rte_mbuf *pkt; + uint32_t wqe_size_in_bu; +}; + +struct mana_gdma_queue { + void *buffer; + uint32_t count; /* in entries */ + uint32_t size; /* in bytes */ + uint32_t id; + uint32_t head; + uint32_t tail; +}; + +struct mana_stats { + uint64_t packets; + uint64_t bytes; + uint64_t errors; + uint64_t nombuf; +}; + +#define MANA_MR_BTREE_PER_QUEUE_N 64 +struct mana_txq { + struct mana_priv *priv; + uint32_t num_desc; + struct ibv_cq *cq; + struct ibv_qp *qp; + + struct mana_gdma_queue gdma_sq; + struct mana_gdma_queue gdma_cq; + + uint32_t tx_vp_offset; + + /* For storing pending requests */ + struct mana_txq_desc *desc_ring; + + /* desc_ring_head is where we put pending requests to ring, + * completion pull off desc_ring_tail + */ + uint32_t desc_ring_head, desc_ring_tail; + + struct mana_stats stats; + unsigned int socket; +}; + +struct mana_rxq { + struct mana_priv *priv; + uint32_t num_desc; + struct rte_mempool *mp; + struct ibv_cq *cq; + struct ibv_wq *wq; + + /* For storing pending requests */ + struct mana_rxq_desc *desc_ring; + + /* desc_ring_head is where we put pending requests to ring, + * completion pull off desc_ring_tail + */ + uint32_t desc_ring_head, desc_ring_tail; + + struct mana_gdma_queue gdma_rq; + struct mana_gdma_queue gdma_cq; + + struct mana_stats stats; + + unsigned int socket; +}; + +extern int mana_logtype_driver; +extern int mana_logtype_init; + +#define DRV_LOG(level, fmt, args...) \ + rte_log(RTE_LOG_ ## level, mana_logtype_driver, "%s(): " fmt "\n", \ + __func__, ## args) + +#define PMD_INIT_LOG(level, fmt, args...) 
\ + rte_log(RTE_LOG_ ## level, mana_logtype_init, "%s(): " fmt "\n",\ + __func__, ## args) + +#define PMD_INIT_FUNC_TRACE() PMD_INIT_LOG(DEBUG, " >>") + +const uint32_t *mana_supported_ptypes(struct rte_eth_dev *dev); + +uint16_t mana_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts, + uint16_t pkts_n); + +uint16_t mana_tx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts, + uint16_t pkts_n); + +/** Request timeout for IPC. */ +#define MANA_MP_REQ_TIMEOUT_SEC 5 + +/* Request types for IPC. */ +enum mana_mp_req_type { + MANA_MP_REQ_VERBS_CMD_FD = 1, + MANA_MP_REQ_CREATE_MR, + MANA_MP_REQ_START_RXTX, + MANA_MP_REQ_STOP_RXTX, +}; + +/* Parameters for IPC. */ +struct mana_mp_param { + enum mana_mp_req_type type; + int port_id; + int result; + + /* MANA_MP_REQ_CREATE_MR */ + uintptr_t addr; + uint32_t len; +}; + +#define MANA_MP_NAME "net_mana_mp" +int mana_mp_init_primary(void); +int mana_mp_init_secondary(void); +void mana_mp_uninit_primary(void); +void mana_mp_uninit_secondary(void); +int mana_mp_req_verbs_cmd_fd(struct rte_eth_dev *dev); + +void mana_mp_req_on_rxtx(struct rte_eth_dev *dev, enum mana_mp_req_type type); + +#endif diff --git a/drivers/net/mana/meson.build b/drivers/net/mana/meson.build new file mode 100644 index 0000000000..81c4118f53 --- /dev/null +++ b/drivers/net/mana/meson.build @@ -0,0 +1,44 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2022 Microsoft Corporation + +if not is_linux or not dpdk_conf.has('RTE_ARCH_X86_64') + build = false + reason = 'mana is supported on Linux X86_64' + subdir_done() +endif + +deps += ['pci', 'bus_pci', 'net', 'eal', 'kvargs'] + +sources += files( + 'mana.c', + 'mp.c', +) + +libnames = ['ibverbs', 'mana' ] +foreach libname:libnames + lib = cc.find_library(libname, required:false) + if lib.found() + ext_deps += lib + else + build = false + reason = 'missing dependency, "' + libname + '"' + subdir_done() + endif +endforeach + +required_symbols = [ + ['infiniband/manadv.h',
'manadv_set_context_attr'], + ['infiniband/manadv.h', 'manadv_init_obj'], + ['infiniband/manadv.h', 'MANADV_CTX_ATTR_BUF_ALLOCATORS'], + ['infiniband/manadv.h', 'MANADV_OBJ_QP'], + ['infiniband/manadv.h', 'MANADV_OBJ_CQ'], + ['infiniband/manadv.h', 'MANADV_OBJ_RWQ'], +] + +foreach arg:required_symbols + if not cc.has_header_symbol(arg[0], arg[1]) + build = false + reason = 'missing symbol "' + arg[1] + '" in "' + arg[0] + '"' + subdir_done() + endif +endforeach diff --git a/drivers/net/mana/mp.c b/drivers/net/mana/mp.c new file mode 100644 index 0000000000..d7580e8a28 --- /dev/null +++ b/drivers/net/mana/mp.c @@ -0,0 +1,235 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2022 Microsoft Corporation + */ + +#include <rte_malloc.h> +#include <ethdev_driver.h> +#include <rte_log.h> + +#include <infiniband/verbs.h> + +#include "mana.h" + +extern struct mana_shared_data *mana_shared_data; + +static void mp_init_msg(struct rte_mp_msg *msg, enum mana_mp_req_type type, + int port_id) +{ + struct mana_mp_param *param; + + strlcpy(msg->name, MANA_MP_NAME, sizeof(msg->name)); + msg->len_param = sizeof(*param); + + param = (struct mana_mp_param *)msg->param; + param->type = type; + param->port_id = port_id; +} + +static int mana_mp_primary_handle(const struct rte_mp_msg *mp_msg, + const void *peer) +{ + struct rte_eth_dev *dev; + const struct mana_mp_param *param = + (const struct mana_mp_param *)mp_msg->param; + struct rte_mp_msg mp_res = { 0 }; + struct mana_mp_param *res = (struct mana_mp_param *)mp_res.param; + int ret; + struct mana_priv *priv; + + if (!rte_eth_dev_is_valid_port(param->port_id)) { + DRV_LOG(ERR, "MP handle port ID %u invalid", param->port_id); + return -ENODEV; + } + + dev = &rte_eth_devices[param->port_id]; + priv = dev->data->dev_private; + + mp_init_msg(&mp_res, param->type, param->port_id); + + switch (param->type) { + case MANA_MP_REQ_VERBS_CMD_FD: + mp_res.num_fds = 1; + mp_res.fds[0] = priv->ib_ctx->cmd_fd; + res->result = 0; + ret = 
rte_mp_reply(&mp_res, peer); + break; + + default: + DRV_LOG(ERR, "Port %u unknown primary MP type %u", + param->port_id, param->type); + ret = -EINVAL; + } + + return ret; +} + +static int mana_mp_secondary_handle(const struct rte_mp_msg *mp_msg, + const void *peer) +{ + struct rte_mp_msg mp_res = { 0 }; + struct mana_mp_param *res = (struct mana_mp_param *)mp_res.param; + const struct mana_mp_param *param = + (const struct mana_mp_param *)mp_msg->param; + struct rte_eth_dev *dev; + int ret; + + if (!rte_eth_dev_is_valid_port(param->port_id)) { + DRV_LOG(ERR, "MP handle port ID %u invalid", param->port_id); + return -ENODEV; + } + + dev = &rte_eth_devices[param->port_id]; + + mp_init_msg(&mp_res, param->type, param->port_id); + + switch (param->type) { + case MANA_MP_REQ_START_RXTX: + DRV_LOG(INFO, "Port %u starting datapath", dev->data->port_id); + + rte_mb(); + + res->result = 0; + ret = rte_mp_reply(&mp_res, peer); + break; + + case MANA_MP_REQ_STOP_RXTX: + DRV_LOG(INFO, "Port %u stopping datapath", dev->data->port_id); + + dev->tx_pkt_burst = mana_tx_burst_removed; + dev->rx_pkt_burst = mana_rx_burst_removed; + + rte_mb(); + + res->result = 0; + ret = rte_mp_reply(&mp_res, peer); + break; + + default: + DRV_LOG(ERR, "Port %u unknown secondary MP type %u", + param->port_id, param->type); + ret = -EINVAL; + } + + return ret; +} + +int mana_mp_init_primary(void) +{ + int ret; + + ret = rte_mp_action_register(MANA_MP_NAME, mana_mp_primary_handle); + if (ret && rte_errno != ENOTSUP) { + DRV_LOG(ERR, "Failed to register primary handler %d %d", + ret, rte_errno); + return -1; + } + + return 0; +} + +void mana_mp_uninit_primary(void) +{ + rte_mp_action_unregister(MANA_MP_NAME); +} + +int mana_mp_init_secondary(void) +{ + return rte_mp_action_register(MANA_MP_NAME, mana_mp_secondary_handle); +} + +void mana_mp_uninit_secondary(void) +{ + rte_mp_action_unregister(MANA_MP_NAME); +} + +int mana_mp_req_verbs_cmd_fd(struct rte_eth_dev *dev) +{ + struct rte_mp_msg mp_req = { 
0 }; + struct rte_mp_msg *mp_res; + struct rte_mp_reply mp_rep; + struct mana_mp_param *res; + struct timespec ts = {.tv_sec = MANA_MP_REQ_TIMEOUT_SEC, .tv_nsec = 0}; + int ret; + + mp_init_msg(&mp_req, MANA_MP_REQ_VERBS_CMD_FD, dev->data->port_id); + + ret = rte_mp_request_sync(&mp_req, &mp_rep, &ts); + if (ret) { + DRV_LOG(ERR, "port %u request to primary process failed", + dev->data->port_id); + return ret; + } + + if (mp_rep.nb_received != 1) { + DRV_LOG(ERR, "primary replied %u messages", mp_rep.nb_received); + ret = -EPROTO; + goto exit; + } + + mp_res = &mp_rep.msgs[0]; + res = (struct mana_mp_param *)mp_res->param; + if (res->result) { + DRV_LOG(ERR, "failed to get CMD FD, port %u", + dev->data->port_id); + ret = res->result; + goto exit; + } + + if (mp_res->num_fds != 1) { + DRV_LOG(ERR, "got FDs %d unexpected", mp_res->num_fds); + ret = -EPROTO; + goto exit; + } + + ret = mp_res->fds[0]; + DRV_LOG(ERR, "port %u command FD from primary is %d", + dev->data->port_id, ret); +exit: + free(mp_rep.msgs); + return ret; +} + +void mana_mp_req_on_rxtx(struct rte_eth_dev *dev, enum mana_mp_req_type type) +{ + struct rte_mp_msg mp_req = { 0 }; + struct rte_mp_msg *mp_res; + struct rte_mp_reply mp_rep; + struct mana_mp_param *res; + struct timespec ts = {.tv_sec = MANA_MP_REQ_TIMEOUT_SEC, .tv_nsec = 0}; + int i, ret; + + if (type != MANA_MP_REQ_START_RXTX && type != MANA_MP_REQ_STOP_RXTX) { + DRV_LOG(ERR, "port %u unknown request (req_type %d)", + dev->data->port_id, type); + return; + } + + if (!mana_shared_data->secondary_cnt) + return; + + mp_init_msg(&mp_req, type, dev->data->port_id); + + ret = rte_mp_request_sync(&mp_req, &mp_rep, &ts); + if (ret) { + if (rte_errno != ENOTSUP) + DRV_LOG(ERR, "port %u failed to request Rx/Tx (%d)", + dev->data->port_id, type); + goto exit; + } + if (mp_rep.nb_sent != mp_rep.nb_received) { + DRV_LOG(ERR, "port %u not all secondaries responded (%d)", + dev->data->port_id, type); + goto exit; + } + for (i = 0; i < 
mp_rep.nb_received; i++) { + mp_res = &mp_rep.msgs[i]; + res = (struct mana_mp_param *)mp_res->param; + if (res->result) { + DRV_LOG(ERR, "port %u request failed on secondary %d", + dev->data->port_id, i); + goto exit; + } + } +exit: + free(mp_rep.msgs); +} diff --git a/drivers/net/mana/version.map b/drivers/net/mana/version.map new file mode 100644 index 0000000000..c2e0723b4c --- /dev/null +++ b/drivers/net/mana/version.map @@ -0,0 +1,3 @@ +DPDK_22 { + local: *; +}; diff --git a/drivers/net/meson.build b/drivers/net/meson.build index 2355d1cde8..0b111a6ebb 100644 --- a/drivers/net/meson.build +++ b/drivers/net/meson.build @@ -34,6 +34,7 @@ drivers = [ 'ixgbe', 'kni', 'liquidio', + 'mana', 'memif', 'mlx4', 'mlx5', -- 2.17.1