From: Long Li <lon...@microsoft.com>

MANA is a PCI device. It uses IB verbs to access hardware through the kernel
RDMA layer. This patch introduces build environment and basic device probe
functions.

Signed-off-by: Long Li <lon...@microsoft.com>
---
Change log:
v2:
Fix typos.
Make the driver build only on x86-64 and Linux.
Remove unused header files.
Change port definition to uint16_t or uint8_t (for IB).
Use getline() in place of fgets() to read and truncate a line.
v3:
Add meson build check for required functions from RDMA direct verb header file

 MAINTAINERS                       |   6 +
 doc/guides/nics/features/mana.ini |  10 +
 doc/guides/nics/index.rst         |   1 +
 doc/guides/nics/mana.rst          |  66 +++
 drivers/net/mana/mana.c           | 704 ++++++++++++++++++++++++++++++
 drivers/net/mana/mana.h           | 210 +++++++++
 drivers/net/mana/meson.build      |  44 ++
 drivers/net/mana/mp.c             | 235 ++++++++++
 drivers/net/mana/version.map      |   3 +
 drivers/net/meson.build           |   1 +
 10 files changed, 1280 insertions(+)
 create mode 100644 doc/guides/nics/features/mana.ini
 create mode 100644 doc/guides/nics/mana.rst
 create mode 100644 drivers/net/mana/mana.c
 create mode 100644 drivers/net/mana/mana.h
 create mode 100644 drivers/net/mana/meson.build
 create mode 100644 drivers/net/mana/mp.c
 create mode 100644 drivers/net/mana/version.map

diff --git a/MAINTAINERS b/MAINTAINERS
index 18d9edaf88..b8bda48a33 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -837,6 +837,12 @@ F: buildtools/options-ibverbs-static.sh
 F: doc/guides/nics/mlx5.rst
 F: doc/guides/nics/features/mlx5.ini
 
+Microsoft mana
+M: Long Li <lon...@microsoft.com>
+F: drivers/net/mana
+F: doc/guides/nics/mana.rst
+F: doc/guides/nics/features/mana.ini
+
 Microsoft vdev_netvsc - EXPERIMENTAL
 M: Matan Azrad <ma...@nvidia.com>
 F: drivers/net/vdev_netvsc/
diff --git a/doc/guides/nics/features/mana.ini b/doc/guides/nics/features/mana.ini
new file mode 100644
index 0000000000..b92a27374c
--- /dev/null
+++ b/doc/guides/nics/features/mana.ini
@@ -0,0 +1,10 @@
+;
+; Supported features of the 'mana' network poll mode driver.
+;
+; Refer to default.ini for the full list of available PMD features.
+;
+[Features]
+Linux                = Y
+Multiprocess aware   = Y
+Usage doc            = Y
+x86-64               = Y
diff --git a/doc/guides/nics/index.rst b/doc/guides/nics/index.rst
index 1c94caccea..2725d1d9f0 100644
--- a/doc/guides/nics/index.rst
+++ b/doc/guides/nics/index.rst
@@ -41,6 +41,7 @@ Network Interface Controller Drivers
     intel_vf
     kni
     liquidio
+    mana
     memif
     mlx4
     mlx5
diff --git a/doc/guides/nics/mana.rst b/doc/guides/nics/mana.rst
new file mode 100644
index 0000000000..40e18fe810
--- /dev/null
+++ b/doc/guides/nics/mana.rst
@@ -0,0 +1,66 @@
+..  SPDX-License-Identifier: BSD-3-Clause
+    Copyright 2022 Microsoft Corporation
+
+MANA poll mode driver library
+=============================
+
+The MANA poll mode driver library (**librte_net_mana**) implements support
+for Microsoft Azure Network Adapter VF in SR-IOV context.
+
+Features
+--------
+
+Features of the MANA Ethdev PMD are:
+
+Prerequisites
+-------------
+
+This driver relies on external libraries and kernel drivers for resources
+allocations and initialization. The following dependencies are not part of
+DPDK and must be installed separately:
+
+- **libibverbs** (provided by rdma-core package)
+
+  User space verbs framework used by librte_net_mana. This library provides
+  a generic interface between the kernel and low-level user space drivers
+  such as libmana.
+
+  It allows slow and privileged operations (context initialization, hardware
+  resources allocations) to be managed by the kernel and fast operations to
+  never leave user space.
+
+- **libmana** (provided by rdma-core package)
+
+  Low-level user space driver library for Microsoft Azure Network Adapter
+  devices, it is automatically loaded by libibverbs.
+
+- **Kernel modules**
+
+  They provide the kernel-side verbs API and low level device drivers that
+  manage actual hardware initialization and resources sharing with user
+  space processes.
+
+  Unlike most other PMDs, these modules must remain loaded and bound to
+  their devices:
+
+  - mana: Ethernet device driver that provides kernel network interfaces.
+  - mana_ib: InfiniBand device driver.
+  - ib_uverbs: user space driver for verbs (entry point for libibverbs).
+
+Driver compilation and testing
+------------------------------
+
+Refer to the document :ref:`compiling and testing a PMD for a NIC <pmd_build_and_test>`
+for details.
+
+MANA PMD arguments
+------------------
+
+The user can specify the following argument in devargs.
+
+#.  ``mac``:
+
+    Specify the MAC address for this device. If it is set, the driver
+    probes and loads the NIC with a matching mac address. If it is not
+    set, the driver probes on all the NICs on the PCI device. The default
+    value is not set, meaning all the NICs will be probed and loaded.
diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
new file mode 100644
index 0000000000..63ec1f75f0
--- /dev/null
+++ b/drivers/net/mana/mana.c
@@ -0,0 +1,704 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2022 Microsoft Corporation
+ */
+
+#include <unistd.h>
+#include <dirent.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+
+#include <ethdev_driver.h>
+#include <ethdev_pci.h>
+#include <rte_kvargs.h>
+#include <rte_eal_paging.h>
+
+#include <infiniband/verbs.h>
+#include <infiniband/manadv.h>
+
+#include <assert.h>
+
+#include "mana.h"
+
+/* Shared memory between primary/secondary processes, per driver */
+struct mana_shared_data *mana_shared_data;
+const struct rte_memzone *mana_shared_mz;
+static const char *MZ_MANA_SHARED_DATA = "mana_shared_data";
+
+struct mana_shared_data mana_local_data;
+
+/* Spinlock for mana_shared_data */
+static rte_spinlock_t mana_shared_data_lock = RTE_SPINLOCK_INITIALIZER;
+
+/*
+ * Allocate a buffer on the stack and fill it with a printf format string.
+ *
+ * Expands to two declarations (an int named mkstr_size_<name> and a
+ * variable-length char array called <name>) followed by statements, so it
+ * must be used where declarations are allowed and cannot serve as the
+ * single-statement body of an if/for/while. The first snprintf() call only
+ * measures the formatted length; the second one writes the string.
+ */
+#define MKSTR(name, ...) \
+       int mkstr_size_##name = snprintf(NULL, 0, "" __VA_ARGS__); \
+       char name[mkstr_size_##name + 1]; \
+       \
+       memset(name, 0, mkstr_size_##name + 1); \
+       snprintf(name, sizeof(name), "" __VA_ARGS__)
+
+int mana_logtype_driver;
+int mana_logtype_init;
+
+/* Ethdev callbacks installed on ports probed by the primary process (empty in this patch). */
+const struct eth_dev_ops mana_dev_ops = {
+};
+
+/* Ethdev callbacks installed on ports attached by a secondary process (empty in this patch). */
+const struct eth_dev_ops mana_dev_sec_ops = {
+};
+
+/*
+ * Placeholder Rx burst handler: it receives nothing. It issues a full
+ * memory barrier (rte_mb()) and reports zero packets. The probe code
+ * installs it as rx_pkt_burst on ports created by the primary process.
+ */
+uint16_t
+mana_rx_burst_removed(void *dpdk_rxq __rte_unused,
+                     struct rte_mbuf **pkts __rte_unused,
+                     uint16_t pkts_n __rte_unused)
+{
+       rte_mb();
+       return 0;
+}
+
+/*
+ * Placeholder Tx burst handler: it sends nothing. It issues a full
+ * memory barrier (rte_mb()) and reports zero packets. The probe code
+ * installs it as tx_pkt_burst on ports created by the primary process.
+ * NOTE(review): the first parameter is named dpdk_rxq although this is the
+ * Tx handler; it is unused, so the name has no functional effect.
+ */
+uint16_t
+mana_tx_burst_removed(void *dpdk_rxq __rte_unused,
+                     struct rte_mbuf **pkts __rte_unused,
+                     uint16_t pkts_n __rte_unused)
+{
+       rte_mb();
+       return 0;
+}
+
+/*
+ * NULL-terminated list of devargs keys accepted by the driver; passed to
+ * rte_kvargs_parse() as the set of valid keys. Entry 0 ("mac") is the only
+ * key processed.
+ */
+static const char *mana_init_args[] = {
+       "mac",
+       NULL,
+};
+
+/* Up to 8 MAC addresses can be parsed from the EAL command line (devargs) */
+#define MAX_NUM_ADDRESS 8
+struct mana_conf {
+       /* MAC addresses collected from the "mac" devargs, in parse order */
+       struct rte_ether_addr mac_array[MAX_NUM_ADDRESS];
+       /* Number of valid entries in mac_array (next free slot) */
+       unsigned int index;
+};
+
+/*
+ * rte_kvargs_process() handler for the "mac" devarg.
+ *
+ * Parses one MAC address string and appends it to the mana_conf passed in
+ * 'private'.
+ *
+ * Returns 0 on success and a negative value on failure. The failure value
+ * must be negative: rte_kvargs_process() only treats a handler result < 0
+ * as an error, so a positive return would be silently ignored.
+ */
+static int mana_arg_parse_callback(const char *key, const char *val,
+                                  void *private)
+{
+       struct mana_conf *conf = (struct mana_conf *)private;
+       int ret;
+
+       /* A key given without "=value" may reach the handler with no value */
+       if (!val) {
+               DRV_LOG(ERR, "Missing value for key %s", key);
+               return -EINVAL;
+       }
+
+       DRV_LOG(INFO, "key=%s value=%s index=%u", key, val, conf->index);
+
+       if (conf->index >= MAX_NUM_ADDRESS) {
+               DRV_LOG(ERR, "Exceeding max MAC address");
+               return -EINVAL;
+       }
+
+       ret = rte_ether_unformat_addr(val, &conf->mac_array[conf->index]);
+       if (ret) {
+               DRV_LOG(ERR, "Invalid MAC address %s", val);
+               return ret;
+       }
+
+       conf->index++;
+
+       return 0;
+}
+
+/*
+ * Parse the device arguments and collect every "mac" value into conf.
+ *
+ * Returns 0 on success, -EINVAL when the argument string cannot be parsed
+ * or more than MAX_NUM_ADDRESS "mac" keys are given, or the error reported
+ * by rte_kvargs_process().
+ */
+static int mana_parse_args(struct rte_devargs *devargs, struct mana_conf *conf)
+{
+       struct rte_kvargs *kvs;
+       int rc;
+
+       kvs = rte_kvargs_parse(devargs->args, mana_init_args);
+       if (kvs == NULL) {
+               DRV_LOG(ERR, "failed to parse kvargs args=%s", devargs->args);
+               return -EINVAL;
+       }
+
+       if (rte_kvargs_count(kvs, mana_init_args[0]) > MAX_NUM_ADDRESS) {
+               /* More addresses than mac_array can hold */
+               rc = -EINVAL;
+       } else {
+               rc = rte_kvargs_process(kvs, mana_init_args[0],
+                                       mana_arg_parse_callback, conf);
+               if (rc != 0)
+                       DRV_LOG(ERR, "error parsing args");
+       }
+
+       rte_kvargs_free(kvs);
+       return rc;
+}
+
+/*
+ * Look up the MAC address of the kernel net device that backs IB port
+ * 'port' of 'device'.
+ *
+ * Walks <ibdev_path>/device/net/<netdev>/, reads each dev_port file and, for
+ * the entry whose dev_port equals port - 1, parses its address file into
+ * 'addr'.
+ *
+ * Returns 0 with *addr filled in on success, or non-zero on failure:
+ * -ENOENT if the net directory cannot be opened, -ENODEV if no net device
+ * matches the port, -EIO if the address file cannot be read, or the error
+ * from rte_ether_unformat_addr(). *addr is only valid when 0 is returned.
+ */
+static int get_port_mac(struct ibv_device *device, unsigned int port,
+                       struct rte_ether_addr *addr)
+{
+       FILE *file;
+       /* Stays -ENODEV unless a matching net device is found */
+       int ret = -ENODEV;
+       int nread;
+       DIR *dir;
+       struct dirent *dent;
+       unsigned int dev_port;
+       char mac[20];
+
+       MKSTR(path, "%s/device/net", device->ibdev_path);
+
+       dir = opendir(path);
+       if (!dir)
+               return -ENOENT;
+
+       while ((dent = readdir(dir))) {
+               char *name = dent->d_name;
+
+               MKSTR(filepath, "%s/%s/dev_port", path, name);
+
+               /* Ignore . and .. */
+               if ((name[0] == '.') &&
+                   ((name[1] == '\0') ||
+                    ((name[1] == '.') && (name[2] == '\0'))))
+                       continue;
+
+               file = fopen(filepath, "rb");
+               if (!file)
+                       continue;
+
+               /* Keep the fscanf() count out of 'ret' so it never leaks
+                * out of this function as a bogus status.
+                */
+               nread = fscanf(file, "%u", &dev_port);
+               fclose(file);
+
+               if (nread != 1)
+                       continue;
+
+               /* Ethernet ports start at 0, IB port start at 1 */
+               if (dev_port == port - 1) {
+                       MKSTR(addr_path, "%s/%s/address", path, name);
+
+                       file = fopen(addr_path, "rb");
+                       if (!file)
+                               continue;
+
+                       /* Field width bounds the read to sizeof(mac) - 1 */
+                       nread = fscanf(file, "%19s", mac);
+                       fclose(file);
+
+                       if (nread != 1) {
+                               ret = -EIO;
+                               break;
+                       }
+
+                       ret = rte_ether_unformat_addr(mac, addr);
+                       if (ret)
+                               DRV_LOG(ERR, "unrecognized mac addr %s", mac);
+                       break;
+               }
+       }
+
+       closedir(dir);
+       return ret;
+}
+
+/*
+ * Resolve the PCI address of an IB device by scanning the
+ * <ibdev_path>/device/uevent file for its PCI_SLOT_NAME line.
+ *
+ * Returns 0 with *pci_addr filled in, -errno if the uevent file cannot be
+ * opened, or -ENOENT if no PCI_SLOT_NAME line could be parsed. In the
+ * latter case *pci_addr must not be used by the caller.
+ */
+static int mana_ibv_device_to_pci_addr(const struct ibv_device *device,
+                                      struct rte_pci_addr *pci_addr)
+{
+       FILE *file;
+       char *line = NULL;
+       size_t len = 0;
+       /* Only success once a complete PCI address has been parsed */
+       int ret = -ENOENT;
+
+       MKSTR(path, "%s/device/uevent", device->ibdev_path);
+
+       file = fopen(path, "rb");
+       if (!file)
+               return -errno;
+
+       while (getline(&line, &len, file) != -1) {
+               /* Extract information. */
+               if (sscanf(line,
+                          "PCI_SLOT_NAME="
+                          "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n",
+                          &pci_addr->domain,
+                          &pci_addr->bus,
+                          &pci_addr->devid,
+                          &pci_addr->function) == 4) {
+                       ret = 0;
+                       break;
+               }
+       }
+
+       /* getline() allocates 'line'; free(NULL) is harmless if it did not */
+       free(line);
+       fclose(file);
+       return ret;
+}
+
+/*
+ * Allocate the zeroed per-process private data of an ethdev on the device's
+ * NUMA node and attach it to dev->process_private.
+ *
+ * Returns 0 on success or -ENOMEM; dev->process_private is left untouched
+ * on failure.
+ */
+static int mana_proc_priv_init(struct rte_eth_dev *dev)
+{
+       struct mana_process_priv *proc_priv =
+               rte_zmalloc_socket("mana_proc_priv",
+                                  sizeof(struct mana_process_priv),
+                                  RTE_CACHE_LINE_SIZE,
+                                  dev->device->numa_node);
+
+       if (proc_priv == NULL)
+               return -ENOMEM;
+
+       dev->process_private = proc_priv;
+       return 0;
+}
+
+/*
+ * Map one page of 'fd' (offset 0, write-only, shared) and record it as the
+ * doorbell page in the ethdev's per-process private data.
+ *
+ * Returns 0 on success or -ENOMEM if mmap() fails. The caller keeps
+ * ownership of 'fd'.
+ */
+static int mana_map_doorbell_secondary(struct rte_eth_dev *eth_dev, int fd)
+{
+       struct mana_process_priv *proc_priv = eth_dev->process_private;
+       void *db;
+
+       db = mmap(NULL, rte_mem_page_size(), PROT_WRITE, MAP_SHARED, fd, 0);
+       if (db != MAP_FAILED) {
+               DRV_LOG(INFO, "Secondary doorbell mapped to %p", db);
+               proc_priv->db_page = db;
+               return 0;
+       }
+
+       DRV_LOG(ERR, "Failed to map secondary doorbell port %u",
+               eth_dev->data->port_id);
+       return -ENOMEM;
+}
+
+/*
+ * Set up the driver-wide shared data (common to all devices).
+ *
+ * The primary process reserves and zeroes the memzone and initializes its
+ * lock; a secondary process looks the memzone up and clears its local
+ * bookkeeping. Does nothing if mana_shared_data is already set. Serialized
+ * by mana_shared_data_lock.
+ *
+ * Returns 0 on success or -rte_errno on memzone failure.
+ */
+static int mana_init_shared_data(void)
+{
+       const struct rte_memzone *mz;
+       int rc = 0;
+
+       rte_spinlock_lock(&mana_shared_data_lock);
+
+       if (mana_shared_data == NULL) {
+               if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
+                       mz = rte_memzone_lookup(MZ_MANA_SHARED_DATA);
+                       if (mz == NULL) {
+                               DRV_LOG(ERR, "Cannot attach mana shared data");
+                               rc = -rte_errno;
+                       } else {
+                               mana_shared_data = mz->addr;
+                               memset(&mana_local_data, 0,
+                                      sizeof(mana_local_data));
+                       }
+               } else {
+                       mana_shared_mz =
+                               rte_memzone_reserve(MZ_MANA_SHARED_DATA,
+                                                   sizeof(*mana_shared_data),
+                                                   SOCKET_ID_ANY, 0);
+                       if (mana_shared_mz == NULL) {
+                               DRV_LOG(ERR, "Cannot allocate mana shared data");
+                               rc = -rte_errno;
+                       } else {
+                               mana_shared_data = mana_shared_mz->addr;
+                               memset(mana_shared_data, 0,
+                                      sizeof(*mana_shared_data));
+                               rte_spinlock_init(&mana_shared_data->lock);
+                       }
+               }
+       }
+
+       rte_spinlock_unlock(&mana_shared_data_lock);
+
+       return rc;
+}
+
+/*
+ * One-time multi-process (MP) initialization for the calling process.
+ *
+ * Ensures the shared data exists, then, under mana_shared_data->lock,
+ * runs mana_mp_init_primary() or mana_mp_init_secondary() unless the
+ * matching init_done flag is already set (shared flag for the primary,
+ * process-local flag for a secondary).
+ *
+ * Returns 0 on success, the error from the shared-data or MP init step, or
+ * -EPROTO for an unexpected process type.
+ */
+static int mana_init_once(void)
+{
+       int ret;
+
+       ret = mana_init_shared_data();
+       if (ret)
+               return ret;
+
+       rte_spinlock_lock(&mana_shared_data->lock);
+
+       switch (rte_eal_process_type()) {
+       case RTE_PROC_PRIMARY:
+               if (mana_shared_data->init_done)
+                       break;
+
+               ret = mana_mp_init_primary();
+               if (ret)
+                       break;
+
+               /* Successful init is not an error: log at DEBUG level */
+               DRV_LOG(DEBUG, "MP INIT PRIMARY");
+
+               mana_shared_data->init_done = 1;
+               break;
+
+       case RTE_PROC_SECONDARY:
+
+               if (mana_local_data.init_done)
+                       break;
+
+               ret = mana_mp_init_secondary();
+               if (ret)
+                       break;
+
+               /* Successful init is not an error: log at DEBUG level */
+               DRV_LOG(DEBUG, "MP INIT SECONDARY");
+
+               mana_local_data.init_done = 1;
+               break;
+
+       default:
+               /* Impossible, internal error */
+               ret = -EPROTO;
+               break;
+       }
+
+       rte_spinlock_unlock(&mana_shared_data->lock);
+
+       return ret;
+}
+
+/*
+ * Probe the IB ports that belong to 'pci_dev' and create (primary process)
+ * or attach to (secondary process) one ethdev per matching port.
+ *
+ * pci_dev:  PCI device being probed; IB devices whose PCI address differs
+ *           are skipped.
+ * mac_addr: if not NULL, only the port with this MAC address is probed;
+ *           if NULL, every port of the device is probed.
+ *
+ * Returns 0 when the scan completes (even if no port matched), -ENODEV if
+ * the IB device list cannot be obtained, or a negative error when setting
+ * up a matching port fails.
+ *
+ * NOTE(review): when a port fails after earlier ports of the same call were
+ * registered, those earlier ports stay registered; only the resources of
+ * the failing port are released.
+ */
+static int mana_pci_probe_mac(struct rte_pci_driver *pci_drv __rte_unused,
+                             struct rte_pci_device *pci_dev,
+                             struct rte_ether_addr *mac_addr)
+{
+       struct ibv_device **ibv_list;
+       int ibv_idx;
+       struct ibv_context *ctx;
+       struct ibv_device_attr_ex dev_attr;
+       int num_devices;
+       int ret = 0;
+       uint8_t port;
+       struct mana_priv *priv = NULL;
+       struct rte_eth_dev *eth_dev = NULL;
+       /* True once a registered primary port keeps a reference to ctx */
+       bool ctx_in_use = false;
+
+       ibv_list = ibv_get_device_list(&num_devices);
+       if (!ibv_list) {
+               /* Nothing to scan; the list must not be freed when NULL */
+               DRV_LOG(ERR, "Failed to get IB device list");
+               return -ENODEV;
+       }
+
+       for (ibv_idx = 0; ibv_idx < num_devices; ibv_idx++) {
+               struct ibv_device *ibdev = ibv_list[ibv_idx];
+               struct rte_pci_addr pci_addr;
+
+               DRV_LOG(INFO, "Probe device name %s dev_name %s ibdev_path %s",
+                       ibdev->name, ibdev->dev_name, ibdev->ibdev_path);
+
+               if (mana_ibv_device_to_pci_addr(ibdev, &pci_addr))
+                       continue;
+
+               /* Ignore if this IB device is not this PCI device */
+               if (pci_dev->addr.domain != pci_addr.domain ||
+                   pci_dev->addr.bus != pci_addr.bus ||
+                   pci_dev->addr.devid != pci_addr.devid ||
+                   pci_dev->addr.function != pci_addr.function)
+                       continue;
+
+               ctx = ibv_open_device(ibdev);
+               if (!ctx) {
+                       DRV_LOG(ERR, "Failed to open IB device %s",
+                               ibdev->name);
+                       continue;
+               }
+
+               ret = ibv_query_device_ex(ctx, NULL, &dev_attr);
+               if (ret) {
+                       /* dev_attr is not valid; do not read it */
+                       DRV_LOG(ERR, "Failed to query IB device %s",
+                               ibdev->name);
+                       ibv_close_device(ctx);
+                       continue;
+               }
+
+               DRV_LOG(INFO, "dev_attr.orig_attr.phys_port_cnt %u",
+                       dev_attr.orig_attr.phys_port_cnt);
+               ctx_in_use = false;
+
+               for (port = 1; port <= dev_attr.orig_attr.phys_port_cnt;
+                    port++) {
+                       struct ibv_parent_domain_init_attr attr = {};
+                       struct rte_ether_addr addr;
+                       char address[64];
+                       char name[RTE_ETH_NAME_MAX_LEN];
+
+                       /* The failure path must only release resources of
+                        * the port being set up, never those of a port
+                        * registered in an earlier iteration.
+                        */
+                       priv = NULL;
+                       eth_dev = NULL;
+
+                       ret = get_port_mac(ibdev, port, &addr);
+                       if (ret)
+                               continue;
+
+                       if (mac_addr &&
+                           !rte_is_same_ether_addr(&addr, mac_addr))
+                               continue;
+
+                       rte_ether_format_addr(address, sizeof(address), &addr);
+                       DRV_LOG(INFO, "device located port %u address %s",
+                               port, address);
+
+                       snprintf(name, sizeof(name), "%s_port%d",
+                                pci_dev->device.name, port);
+
+                       if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+                               int fd;
+
+                               /* No mana_priv is allocated here: nothing in
+                                * the secondary path stores or reads one.
+                                */
+                               eth_dev = rte_eth_dev_attach_secondary(name);
+                               if (!eth_dev) {
+                                       DRV_LOG(ERR, "Can't attach to dev %s",
+                                               name);
+                                       ret = -ENOMEM;
+                                       goto failed;
+                               }
+
+                               eth_dev->device = &pci_dev->device;
+                               eth_dev->dev_ops = &mana_dev_sec_ops;
+                               ret = mana_proc_priv_init(eth_dev);
+                               if (ret)
+                                       goto failed;
+
+                               /* Get the IB FD from the primary process */
+                               fd = mana_mp_req_verbs_cmd_fd(eth_dev);
+                               if (fd < 0) {
+                                       DRV_LOG(ERR, "Failed to get FD %d", fd);
+                                       ret = -ENODEV;
+                                       goto failed;
+                               }
+
+                               ret = mana_map_doorbell_secondary(eth_dev, fd);
+                               if (ret) {
+                                       DRV_LOG(ERR, "Failed secondary map %d",
+                                               fd);
+                                       /* Do not leak the received fd */
+                                       close(fd);
+                                       goto failed;
+                               }
+
+                               /* fd is not used after mapping the doorbell */
+                               close(fd);
+
+                               rte_spinlock_lock(&mana_shared_data->lock);
+                               mana_shared_data->secondary_cnt++;
+                               mana_local_data.secondary_cnt++;
+                               rte_spinlock_unlock(&mana_shared_data->lock);
+
+                               rte_eth_copy_pci_info(eth_dev, pci_dev);
+                               rte_eth_dev_probing_finish(eth_dev);
+
+                               /* Impossible to have more than one port
+                                * matching a MAC address
+                                */
+                               continue;
+                       }
+
+                       priv = rte_zmalloc_socket(NULL, sizeof(*priv),
+                                                 RTE_CACHE_LINE_SIZE,
+                                                 SOCKET_ID_ANY);
+                       if (!priv) {
+                               ret = -ENOMEM;
+                               goto failed;
+                       }
+
+                       eth_dev = rte_eth_dev_allocate(name);
+                       if (!eth_dev) {
+                               ret = -ENOMEM;
+                               goto failed;
+                       }
+
+                       eth_dev->data->mac_addrs =
+                               rte_calloc("mana_mac", 1,
+                                          sizeof(struct rte_ether_addr), 0);
+                       if (!eth_dev->data->mac_addrs) {
+                               ret = -ENOMEM;
+                               goto failed;
+                       }
+
+                       rte_ether_addr_copy(&addr, eth_dev->data->mac_addrs);
+
+                       priv->ib_pd = ibv_alloc_pd(ctx);
+                       if (!priv->ib_pd) {
+                               DRV_LOG(ERR, "ibv_alloc_pd failed port %d",
+                                       port);
+                               ret = -ENOMEM;
+                               goto failed;
+                       }
+
+                       /* Create a parent domain with the port number */
+                       attr.pd = priv->ib_pd;
+                       attr.comp_mask = IBV_PARENT_DOMAIN_INIT_ATTR_PD_CONTEXT;
+                       attr.pd_context = (void *)(uintptr_t)port;
+                       priv->ib_parent_pd =
+                               ibv_alloc_parent_domain(ctx, &attr);
+                       if (!priv->ib_parent_pd) {
+                               DRV_LOG(ERR,
+                                       "ibv_alloc_parent_domain failed port %d",
+                                       port);
+                               ret = -ENOMEM;
+                               goto failed;
+                       }
+
+                       priv->ib_ctx = ctx;
+                       priv->port_id = eth_dev->data->port_id;
+                       priv->dev_port = port;
+                       eth_dev->data->dev_private = priv;
+                       priv->dev_data = eth_dev->data;
+
+                       priv->max_rx_queues = dev_attr.orig_attr.max_qp;
+                       priv->max_tx_queues = dev_attr.orig_attr.max_qp;
+
+                       priv->max_rx_desc =
+                               RTE_MIN(dev_attr.orig_attr.max_qp_wr,
+                                       dev_attr.orig_attr.max_cqe);
+                       priv->max_tx_desc =
+                               RTE_MIN(dev_attr.orig_attr.max_qp_wr,
+                                       dev_attr.orig_attr.max_cqe);
+
+                       priv->max_send_sge = dev_attr.orig_attr.max_sge;
+                       priv->max_recv_sge = dev_attr.orig_attr.max_sge;
+
+                       priv->max_mr = dev_attr.orig_attr.max_mr;
+                       priv->max_mr_size = dev_attr.orig_attr.max_mr_size;
+
+                       DRV_LOG(INFO, "dev %s max queues %d desc %d sge %d\n",
+                               name, priv->max_rx_queues, priv->max_rx_desc,
+                               priv->max_send_sge);
+
+                       rte_spinlock_lock(&mana_shared_data->lock);
+                       mana_shared_data->primary_cnt++;
+                       rte_spinlock_unlock(&mana_shared_data->lock);
+
+                       eth_dev->data->dev_flags |= RTE_ETH_DEV_INTR_RMV;
+
+                       eth_dev->device = &pci_dev->device;
+                       eth_dev->data->dev_flags |=
+                               RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;
+
+                       DRV_LOG(INFO, "device %s at port %u",
+                               name, eth_dev->data->port_id);
+
+                       eth_dev->rx_pkt_burst = mana_rx_burst_removed;
+                       eth_dev->tx_pkt_burst = mana_tx_burst_removed;
+                       eth_dev->dev_ops = &mana_dev_ops;
+
+                       rte_eth_copy_pci_info(eth_dev, pci_dev);
+                       rte_eth_dev_probing_finish(eth_dev);
+
+                       /* This port now references ctx via priv->ib_ctx */
+                       ctx_in_use = true;
+               }
+
+               /* Secondary process doesn't need an ibv_ctx. It maps the
+                * doorbell pages using the IB cmd_fd passed from the primary
+                * process and send messages to primary process for memory
+                * registrations. ctx_in_use is only ever set by the primary
+                * process, so a secondary always closes the context here, as
+                * does a primary that registered no port on this device.
+                */
+               if (!ctx_in_use)
+                       ibv_close_device(ctx);
+       }
+
+       ibv_free_device_list(ibv_list);
+       return 0;
+
+failed:
+       /* Free the resource for the port failed */
+       if (priv) {
+               if (priv->ib_parent_pd)
+                       ibv_dealloc_pd(priv->ib_parent_pd);
+
+               if (priv->ib_pd)
+                       ibv_dealloc_pd(priv->ib_pd);
+       }
+
+       if (eth_dev)
+               rte_eth_dev_release_port(eth_dev);
+
+       rte_free(priv);
+
+       /* Keep the context open if an already registered port still
+        * references it through priv->ib_ctx.
+        */
+       if (!ctx_in_use)
+               ibv_close_device(ctx);
+       ibv_free_device_list(ibv_list);
+
+       return ret;
+}
+
+/*
+ * PCI probe entry point.
+ *
+ * Parses the optional "mac" devargs, performs the one-time driver
+ * initialization, then probes either every port (no MAC given) or one port
+ * per requested MAC address. Stops at, and returns, the first error.
+ */
+static int mana_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
+                         struct rte_pci_device *pci_dev)
+{
+       struct rte_devargs *devargs = pci_dev->device.devargs;
+       struct mana_conf conf = {};
+       unsigned int idx = 0;
+       int rc;
+
+       if (devargs != NULL && devargs->args != NULL) {
+               rc = mana_parse_args(devargs, &conf);
+               if (rc != 0) {
+                       DRV_LOG(ERR, "failed to parse parameters args = %s",
+                               devargs->args);
+                       return rc;
+               }
+       }
+
+       rc = mana_init_once();
+       if (rc != 0) {
+               DRV_LOG(ERR, "Failed to init PMD global data %d", rc);
+               return rc;
+       }
+
+       /* Without any "mac" parameter, probe on all ports */
+       if (conf.index == 0)
+               return mana_pci_probe_mac(pci_drv, pci_dev, NULL);
+
+       /* conf.index is non-zero here, so the body runs at least once */
+       do {
+               rc = mana_pci_probe_mac(pci_drv, pci_dev,
+                                       &conf.mac_array[idx]);
+               if (rc != 0)
+                       return rc;
+       } while (++idx < conf.index);
+
+       return 0;
+}
+
+/*
+ * Per-ethdev uninit callback handed to rte_eth_dev_pci_generic_remove().
+ * It performs no work in this patch and always returns 0.
+ */
+static int mana_dev_uninit(struct rte_eth_dev *dev)
+{
+       RTE_SET_USED(dev);
+       return 0;
+}
+
+/*
+ * PCI remove entry point.
+ *
+ * Primary process: drops one reference from the shared primary counter;
+ * when it reaches zero the primary MP channel is torn down and the shared
+ * memzone is freed. Secondary process: drops one reference from the shared
+ * and the process-local secondary counters; when the local one reaches
+ * zero the secondary MP channel is torn down.
+ *
+ * Returns the result of rte_eth_dev_pci_generic_remove().
+ */
+static int mana_pci_remove(struct rte_pci_device *pci_dev)
+{
+       if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+               unsigned int remaining;
+
+               rte_spinlock_lock(&mana_shared_data_lock);
+
+               /* Shared data is gone once the last reference was dropped */
+               if (mana_shared_data) {
+                       rte_spinlock_lock(&mana_shared_data->lock);
+
+                       RTE_VERIFY(mana_shared_data->primary_cnt > 0);
+                       mana_shared_data->primary_cnt--;
+                       remaining = mana_shared_data->primary_cnt;
+                       if (!remaining) {
+                               DRV_LOG(DEBUG, "mp uninit primary");
+                               mana_mp_uninit_primary();
+                       }
+
+                       rte_spinlock_unlock(&mana_shared_data->lock);
+
+                       /* Also free the shared memory if this is the last */
+                       if (!remaining) {
+                               DRV_LOG(DEBUG, "free shared memezone data");
+                               rte_memzone_free(mana_shared_mz);
+                               /* Clear both pointers so that a later probe
+                                * re-creates the memzone instead of reusing
+                                * freed memory.
+                                */
+                               mana_shared_mz = NULL;
+                               mana_shared_data = NULL;
+                       }
+               }
+
+               rte_spinlock_unlock(&mana_shared_data_lock);
+       } else {
+               rte_spinlock_lock(&mana_shared_data_lock);
+
+               rte_spinlock_lock(&mana_shared_data->lock);
+               RTE_VERIFY(mana_shared_data->secondary_cnt > 0);
+               mana_shared_data->secondary_cnt--;
+               rte_spinlock_unlock(&mana_shared_data->lock);
+
+               RTE_VERIFY(mana_local_data.secondary_cnt > 0);
+               mana_local_data.secondary_cnt--;
+               if (!mana_local_data.secondary_cnt) {
+                       DRV_LOG(DEBUG, "mp uninit secondary");
+                       mana_mp_uninit_secondary();
+               }
+
+               rte_spinlock_unlock(&mana_shared_data_lock);
+       }
+
+       return rte_eth_dev_pci_generic_remove(pci_dev, mana_dev_uninit);
+}
+
+/*
+ * PCI IDs handled by this driver. The table must end with an entry whose
+ * vendor_id is 0: the PCI bus walks id_table until it reaches that
+ * sentinel, so omitting it makes the match loop read past the array.
+ */
+static const struct rte_pci_id mana_pci_id_map[] = {
+       {
+               RTE_PCI_DEVICE(PCI_VENDOR_ID_MICROSOFT,
+                              PCI_DEVICE_ID_MICROSOFT_MANA)
+       },
+       {
+               .vendor_id = 0
+       },
+};
+
+/*
+ * PCI driver descriptor: binds the MANA PCI ID table to the probe/remove
+ * callbacks above and sets the RTE_PCI_DRV_INTR_RMV driver flag.
+ */
+static struct rte_pci_driver mana_pci_driver = {
+       .driver = {
+               .name = "mana_pci",
+       },
+       .id_table = mana_pci_id_map,
+       .probe = mana_pci_probe,
+       .remove = mana_pci_remove,
+       .drv_flags = RTE_PCI_DRV_INTR_RMV,
+};
+
+/* Constructor: registers the MANA PCI driver with the PCI bus. */
+RTE_INIT(rte_mana_pmd_init)
+{
+       rte_pci_register(&mana_pci_driver);
+}
+
+RTE_PMD_EXPORT_NAME(net_mana, __COUNTER__);
+RTE_PMD_REGISTER_PCI_TABLE(net_mana, mana_pci_id_map);
+RTE_PMD_REGISTER_KMOD_DEP(net_mana, "* ib_uverbs & mana_ib");
+RTE_LOG_REGISTER_SUFFIX(mana_logtype_init, init, NOTICE);
+RTE_LOG_REGISTER_SUFFIX(mana_logtype_driver, driver, NOTICE);
diff --git a/drivers/net/mana/mana.h b/drivers/net/mana/mana.h
new file mode 100644
index 0000000000..e30c030b4e
--- /dev/null
+++ b/drivers/net/mana/mana.h
@@ -0,0 +1,210 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2022 Microsoft Corporation
+ */
+
+#ifndef __MANA_H__
+#define __MANA_H__
+
+/* PCI vendor/device pair listed in the driver's PCI ID table */
+enum {
+       PCI_VENDOR_ID_MICROSOFT = 0x1414,
+       PCI_DEVICE_ID_MICROSOFT_MANA = 0x00ba,
+};
+
+/* Shared data between primary/secondary processes */
+struct mana_shared_data {
+       rte_spinlock_t lock; /* taken around the secondary_cnt update on device removal */
+       int init_done;
+       unsigned int primary_cnt; /* the shared memzone is freed once this reaches 0 */
+       unsigned int secondary_cnt; /* when 0, no Rx/Tx start/stop IPC request is sent */
+};
+
+#define MIN_RX_BUF_SIZE        1024
+#define MAX_FRAME_SIZE RTE_ETHER_MAX_LEN
+#define BNIC_MAX_MAC_ADDR 1
+
+/* Rx offload mask, spelled with the RTE_ETH_ prefix like the Tx mask below */
+#define BNIC_DEV_RX_OFFLOAD_SUPPORT ( \
+               RTE_ETH_RX_OFFLOAD_CHECKSUM | \
+               RTE_ETH_RX_OFFLOAD_RSS_HASH)
+
+/* Tx offload mask */
+#define BNIC_DEV_TX_OFFLOAD_SUPPORT ( \
+               RTE_ETH_TX_OFFLOAD_MULTI_SEGS | \
+               RTE_ETH_TX_OFFLOAD_IPV4_CKSUM | \
+               RTE_ETH_TX_OFFLOAD_TCP_CKSUM | \
+               RTE_ETH_TX_OFFLOAD_UDP_CKSUM | \
+               RTE_ETH_TX_OFFLOAD_TCP_TSO)
+
+#define INDIRECTION_TABLE_NUM_ELEMENTS 64
+#define TOEPLITZ_HASH_KEY_SIZE_IN_BYTES 40
+/* RSS hash types mask, using the RTE_ETH_RSS_ names of the ethdev API */
+#define BNIC_ETH_RSS_SUPPORT ( \
+       RTE_ETH_RSS_IPV4 |           \
+       RTE_ETH_RSS_NONFRAG_IPV4_TCP | \
+       RTE_ETH_RSS_NONFRAG_IPV4_UDP | \
+       RTE_ETH_RSS_IPV6 |           \
+       RTE_ETH_RSS_NONFRAG_IPV6_TCP | \
+       RTE_ETH_RSS_NONFRAG_IPV6_UDP)
+
+#define MIN_BUFFERS_PER_QUEUE          64
+#define MAX_RECEIVE_BUFFERS_PER_QUEUE  256
+#define MAX_SEND_BUFFERS_PER_QUEUE     256
+
+struct mana_process_priv {
+       void *db_page; /* NOTE(review): presumably this process's doorbell page mapping - confirm */
+};
+
+/* Per-port private data, reached through dev->data->dev_private */
+struct mana_priv {
+       struct rte_eth_dev_data *dev_data;
+       struct mana_process_priv *process_priv;
+       int num_queues;
+
+       /* DPDK port */
+       uint16_t port_id;
+
+       /* IB device port */
+       uint8_t dev_port;
+
+       /* Verbs context; its cmd_fd is what secondaries request over IPC */
+       struct ibv_context *ib_ctx;
+       struct ibv_pd *ib_pd;
+       struct ibv_pd *ib_parent_pd;
+       struct ibv_rwq_ind_table *ind_table;
+       /* Sized by the named constant rather than a bare 40 */
+       uint8_t ind_table_key[TOEPLITZ_HASH_KEY_SIZE_IN_BYTES];
+       struct ibv_qp *rwq_qp;
+       void *db_page;
+       int max_rx_queues;
+       int max_tx_queues;
+       int max_rx_desc;
+       int max_tx_desc;
+       int max_send_sge;
+       int max_recv_sge;
+       int max_mr;
+       uint64_t max_mr_size;
+};
+
+struct mana_txq_desc { /* element type of mana_txq.desc_ring */
+       struct rte_mbuf *pkt;
+       uint32_t wqe_size_in_bu; /* NOTE(review): "bu" presumably means basic units - confirm */
+};
+
+struct mana_rxq_desc { /* element type of mana_rxq.desc_ring */
+       struct rte_mbuf *pkt;
+       uint32_t wqe_size_in_bu; /* NOTE(review): "bu" presumably means basic units - confirm */
+};
+
+struct mana_gdma_queue { /* type of the gdma_sq/gdma_rq/gdma_cq members of mana_txq and mana_rxq */
+       void *buffer;
+       uint32_t count; /* in entries */
+       uint32_t size;  /* in bytes */
+       uint32_t id;
+       uint32_t head;
+       uint32_t tail;
+};
+
+struct mana_stats { /* counters embedded in each mana_txq and mana_rxq */
+       uint64_t packets;
+       uint64_t bytes;
+       uint64_t errors;
+       uint64_t nombuf; /* NOTE(review): presumably mbuf allocation failures - confirm */
+};
+
+#define MANA_MR_BTREE_PER_QUEUE_N      64
+struct mana_txq { /* state kept for one Tx queue */
+       struct mana_priv *priv;
+       uint32_t num_desc;
+       struct ibv_cq *cq;
+       struct ibv_qp *qp;
+
+       struct mana_gdma_queue gdma_sq;
+       struct mana_gdma_queue gdma_cq;
+
+       uint32_t tx_vp_offset;
+
+       /* For storing pending requests */
+       struct mana_txq_desc *desc_ring;
+
+       /* New pending requests are put on the ring at desc_ring_head;
+        * completions are pulled off at desc_ring_tail
+        */
+       uint32_t desc_ring_head, desc_ring_tail;
+
+       struct mana_stats stats;
+       unsigned int socket; /* NOTE(review): presumably the NUMA socket of the queue - confirm */
+};
+
+struct mana_rxq { /* state kept for one Rx queue */
+       struct mana_priv *priv;
+       uint32_t num_desc;
+       struct rte_mempool *mp;
+       struct ibv_cq *cq;
+       struct ibv_wq *wq;
+
+       /* For storing pending requests */
+       struct mana_rxq_desc *desc_ring;
+
+       /* New pending requests are put on the ring at desc_ring_head;
+        * completions are pulled off at desc_ring_tail
+        */
+       uint32_t desc_ring_head, desc_ring_tail;
+
+       struct mana_gdma_queue gdma_rq;
+       struct mana_gdma_queue gdma_cq;
+
+       struct mana_stats stats;
+
+       unsigned int socket; /* NOTE(review): presumably the NUMA socket of the queue - confirm */
+};
+
+extern int mana_logtype_driver; /* log type DRV_LOG() passes to rte_log() */
+extern int mana_logtype_init; /* log type PMD_INIT_LOG() passes to rte_log() */
+
+#define DRV_LOG(level, fmt, args...) \
+       rte_log(RTE_LOG_ ## level, mana_logtype_driver, "%s(): " fmt "\n", \
+               __func__, ## args)
+
+#define PMD_INIT_LOG(level, fmt, args...) \
+       rte_log(RTE_LOG_ ## level, mana_logtype_init, "%s(): " fmt "\n",\
+               __func__, ## args)
+
+#define PMD_INIT_FUNC_TRACE() PMD_INIT_LOG(DEBUG, " >>") /* DEBUG-level marker carrying the caller's function name */
+
+const uint32_t *mana_supported_ptypes(struct rte_eth_dev *dev);
+
+/* Rx burst handler a secondary process installs when the datapath stops */
+uint16_t mana_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
+                              uint16_t pkts_n);
+
+/* Tx burst handler a secondary process installs when the datapath stops;
+ * the queue argument is a Tx queue, so it is named dpdk_txq
+ */
+uint16_t mana_tx_burst_removed(void *dpdk_txq, struct rte_mbuf **pkts,
+                              uint16_t pkts_n);
+
+/** Request timeout for IPC. */
+#define MANA_MP_REQ_TIMEOUT_SEC 5
+
+/* Request types for IPC. */
+enum mana_mp_req_type {
+       MANA_MP_REQ_VERBS_CMD_FD = 1, /* served by the primary: replies with the verbs command FD */
+       MANA_MP_REQ_CREATE_MR,
+       MANA_MP_REQ_START_RXTX, /* served by secondaries: datapath is starting */
+       MANA_MP_REQ_STOP_RXTX, /* served by secondaries: switch to the "removed" burst handlers */
+};
+
+/* Parameters for IPC. */
+struct mana_mp_param {
+       enum mana_mp_req_type type;
+       int port_id;
+       int result; /* set to 0 by the handlers on success; non-zero is treated as failure */
+
+       /* MANA_MP_REQ_CREATE_MR */
+       uintptr_t addr;
+       uint32_t len;
+};
+
+#define MANA_MP_NAME   "net_mana_mp"
+int mana_mp_init_primary(void); /* 0 on success (or when IPC reports ENOTSUP), -1 otherwise */
+int mana_mp_init_secondary(void); /* result of rte_mp_action_register() */
+void mana_mp_uninit_primary(void);
+void mana_mp_uninit_secondary(void);
+int mana_mp_req_verbs_cmd_fd(struct rte_eth_dev *dev); /* FD from the primary, or a negative value on failure */
+
+void mana_mp_req_on_rxtx(struct rte_eth_dev *dev, enum mana_mp_req_type type); /* type must be START_RXTX or STOP_RXTX */
+
+#endif
diff --git a/drivers/net/mana/meson.build b/drivers/net/mana/meson.build
new file mode 100644
index 0000000000..81c4118f53
--- /dev/null
+++ b/drivers/net/mana/meson.build
@@ -0,0 +1,44 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2022 Microsoft Corporation
+
+if not is_linux or not dpdk_conf.has('RTE_ARCH_X86_64')
+    build = false
+    reason = 'mana is supported on Linux X86_64'
+    subdir_done()
+endif
+
+deps += ['pci', 'bus_pci', 'net', 'eal', 'kvargs']
+
+sources += files(
+        'mana.c',
+        'mp.c',
+)
+
+# Look the libraries up through pkg-config first so that include and library
+# paths of installations in non-default prefixes are picked up; fall back to
+# a plain library search otherwise.
+libnames = ['ibverbs', 'mana']
+libs = []
+foreach libname:libnames
+    lib = dependency('lib' + libname, required: false, method: 'pkg-config')
+    if not lib.found()
+        lib = cc.find_library(libname, required: false)
+    endif
+    if lib.found()
+        libs += lib
+        ext_deps += lib
+    else
+        build = false
+        reason = 'missing dependency, "' + libname + '"'
+        subdir_done()
+    endif
+endforeach
+
+# Functions and enum values the driver needs from the RDMA direct verbs header
+required_symbols = [
+    ['infiniband/manadv.h', 'manadv_set_context_attr'],
+    ['infiniband/manadv.h', 'manadv_init_obj'],
+    ['infiniband/manadv.h', 'MANADV_CTX_ATTR_BUF_ALLOCATORS'],
+    ['infiniband/manadv.h', 'MANADV_OBJ_QP'],
+    ['infiniband/manadv.h', 'MANADV_OBJ_CQ'],
+    ['infiniband/manadv.h', 'MANADV_OBJ_RWQ'],
+]
+
+# Check the header with the include paths of the dependencies found above
+foreach arg:required_symbols
+    if not cc.has_header_symbol(arg[0], arg[1], dependencies: libs)
+        build = false
+        reason = 'missing symbol "' + arg[1] + '" in "' + arg[0] + '"'
+        subdir_done()
+    endif
+endforeach
diff --git a/drivers/net/mana/mp.c b/drivers/net/mana/mp.c
new file mode 100644
index 0000000000..d7580e8a28
--- /dev/null
+++ b/drivers/net/mana/mp.c
@@ -0,0 +1,235 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2022 Microsoft Corporation
+ */
+
+#include <rte_malloc.h>
+#include <ethdev_driver.h>
+#include <rte_log.h>
+
+#include <infiniband/verbs.h>
+
+#include "mana.h"
+
+extern struct mana_shared_data *mana_shared_data;
+
+/*
+ * Prepare an IPC message for the MANA_MP_NAME action: store the request
+ * type and port ID in the payload, set the payload length and the action
+ * name. The payload's result field is left untouched.
+ */
+static void mp_init_msg(struct rte_mp_msg *msg, enum mana_mp_req_type type,
+                       int port_id)
+{
+       struct mana_mp_param *payload = (struct mana_mp_param *)msg->param;
+
+       payload->port_id = port_id;
+       payload->type = type;
+
+       msg->len_param = sizeof(struct mana_mp_param);
+       strlcpy(msg->name, MANA_MP_NAME, sizeof(msg->name));
+}
+
+/*
+ * IPC handler registered by the primary process.
+ *
+ * Only MANA_MP_REQ_VERBS_CMD_FD is served: the reply carries the verbs
+ * command FD of the requested port. Returns -ENODEV for an invalid port and
+ * -EINVAL for any other request type (no reply is sent in either case),
+ * otherwise the value returned by rte_mp_reply().
+ */
+static int mana_mp_primary_handle(const struct rte_mp_msg *mp_msg,
+                                 const void *peer)
+{
+       const struct mana_mp_param *req =
+               (const struct mana_mp_param *)mp_msg->param;
+       struct rte_mp_msg reply = { 0 };
+       struct mana_mp_param *reply_param =
+               (struct mana_mp_param *)reply.param;
+       struct mana_priv *priv;
+
+       if (!rte_eth_dev_is_valid_port(req->port_id)) {
+               DRV_LOG(ERR, "MP handle port ID %u invalid", req->port_id);
+               return -ENODEV;
+       }
+
+       priv = rte_eth_devices[req->port_id].data->dev_private;
+
+       mp_init_msg(&reply, req->type, req->port_id);
+
+       if (req->type != MANA_MP_REQ_VERBS_CMD_FD) {
+               DRV_LOG(ERR, "Port %u unknown primary MP type %u",
+                       req->port_id, req->type);
+               return -EINVAL;
+       }
+
+       /* Pass the command FD along with the reply */
+       reply.num_fds = 1;
+       reply.fds[0] = priv->ib_ctx->cmd_fd;
+       reply_param->result = 0;
+
+       return rte_mp_reply(&reply, peer);
+}
+
+/*
+ * IPC handler registered by secondary processes.
+ *
+ * Serves MANA_MP_REQ_START_RXTX and MANA_MP_REQ_STOP_RXTX; on stop the
+ * port's burst callbacks are replaced by the "removed" handlers. Both
+ * requests are answered with result 0. Returns -ENODEV for an invalid port
+ * and -EINVAL for any other request type (no reply is sent in either case),
+ * otherwise the value returned by rte_mp_reply().
+ */
+static int mana_mp_secondary_handle(const struct rte_mp_msg *mp_msg,
+                                   const void *peer)
+{
+       const struct mana_mp_param *req =
+               (const struct mana_mp_param *)mp_msg->param;
+       struct rte_mp_msg reply = { 0 };
+       struct mana_mp_param *reply_param =
+               (struct mana_mp_param *)reply.param;
+       struct rte_eth_dev *dev;
+
+       if (!rte_eth_dev_is_valid_port(req->port_id)) {
+               DRV_LOG(ERR, "MP handle port ID %u invalid", req->port_id);
+               return -ENODEV;
+       }
+
+       dev = &rte_eth_devices[req->port_id];
+
+       mp_init_msg(&reply, req->type, req->port_id);
+
+       switch (req->type) {
+       case MANA_MP_REQ_START_RXTX:
+               DRV_LOG(INFO, "Port %u starting datapath", dev->data->port_id);
+               break;
+
+       case MANA_MP_REQ_STOP_RXTX:
+               DRV_LOG(INFO, "Port %u stopping datapath", dev->data->port_id);
+
+               dev->tx_pkt_burst = mana_tx_burst_removed;
+               dev->rx_pkt_burst = mana_rx_burst_removed;
+               break;
+
+       default:
+               DRV_LOG(ERR, "Port %u unknown secondary MP type %u",
+                       req->port_id, req->type);
+               return -EINVAL;
+       }
+
+       /* Common tail of both served requests: barrier, then acknowledge */
+       rte_mb();
+
+       reply_param->result = 0;
+       return rte_mp_reply(&reply, peer);
+}
+
+/*
+ * Register the primary-side IPC handler under MANA_MP_NAME.
+ *
+ * Returns 0 on success, and also when registration fails with rte_errno set
+ * to ENOTSUP; returns -1 on any other failure.
+ */
+int mana_mp_init_primary(void)
+{
+       int ret = rte_mp_action_register(MANA_MP_NAME, mana_mp_primary_handle);
+
+       if (ret == 0 || rte_errno == ENOTSUP)
+               return 0;
+
+       DRV_LOG(ERR, "Failed to register primary handler %d %d",
+               ret, rte_errno);
+       return -1;
+}
+
+void mana_mp_uninit_primary(void) /* unregisters the MANA_MP_NAME IPC action */
+{
+       rte_mp_action_unregister(MANA_MP_NAME);
+}
+
+int mana_mp_init_secondary(void) /* returns rte_mp_action_register()'s result unchanged */
+{
+       return rte_mp_action_register(MANA_MP_NAME, mana_mp_secondary_handle);
+}
+
+void mana_mp_uninit_secondary(void) /* unregisters the MANA_MP_NAME IPC action */
+{
+       rte_mp_action_unregister(MANA_MP_NAME);
+}
+
+/*
+ * Ask the primary process for the verbs command FD of a port.
+ *
+ * Sends MANA_MP_REQ_VERBS_CMD_FD and waits up to MANA_MP_REQ_TIMEOUT_SEC
+ * seconds for the answer.
+ *
+ * Returns the FD carried by the reply on success. On failure returns the
+ * rte_mp_request_sync() error, the non-zero result reported by the primary,
+ * or -EPROTO when the reply does not consist of exactly one message with
+ * exactly one FD.
+ */
+int mana_mp_req_verbs_cmd_fd(struct rte_eth_dev *dev)
+{
+       struct rte_mp_msg mp_req = { 0 };
+       struct rte_mp_msg *mp_res;
+       struct rte_mp_reply mp_rep;
+       struct mana_mp_param *res;
+       struct timespec ts = {.tv_sec = MANA_MP_REQ_TIMEOUT_SEC, .tv_nsec = 0};
+       int ret;
+
+       mp_init_msg(&mp_req, MANA_MP_REQ_VERBS_CMD_FD, dev->data->port_id);
+
+       ret = rte_mp_request_sync(&mp_req, &mp_rep, &ts);
+       if (ret) {
+               DRV_LOG(ERR, "port %u request to primary process failed",
+                       dev->data->port_id);
+               return ret;
+       }
+
+       if (mp_rep.nb_received != 1) {
+               /* nb_received is an int, print it as one */
+               DRV_LOG(ERR, "primary replied %d messages", mp_rep.nb_received);
+               ret = -EPROTO;
+               goto exit;
+       }
+
+       mp_res = &mp_rep.msgs[0];
+       res = (struct mana_mp_param *)mp_res->param;
+       if (res->result) {
+               DRV_LOG(ERR, "failed to get CMD FD, port %u",
+                       dev->data->port_id);
+               ret = res->result;
+               goto exit;
+       }
+
+       if (mp_res->num_fds != 1) {
+               DRV_LOG(ERR, "got FDs %d unexpected", mp_res->num_fds);
+               ret = -EPROTO;
+               goto exit;
+       }
+
+       ret = mp_res->fds[0];
+       /* Success path: informational, not an error */
+       DRV_LOG(INFO, "port %u command FD from primary is %d",
+               dev->data->port_id, ret);
+exit:
+       /* The reply messages are owned by the caller of the request */
+       free(mp_rep.msgs);
+       return ret;
+}
+
+/*
+ * Send a MANA_MP_REQ_START_RXTX or MANA_MP_REQ_STOP_RXTX request over IPC
+ * and check the replies.
+ *
+ * Nothing is sent when the shared secondary count is 0. Failures are only
+ * logged; the function has no return value.
+ */
+void mana_mp_req_on_rxtx(struct rte_eth_dev *dev, enum mana_mp_req_type type)
+{
+       struct timespec timeout = {
+               .tv_sec = MANA_MP_REQ_TIMEOUT_SEC,
+               .tv_nsec = 0,
+       };
+       struct rte_mp_msg request = { 0 };
+       struct rte_mp_reply reply;
+       int idx;
+
+       switch (type) {
+       case MANA_MP_REQ_START_RXTX:
+       case MANA_MP_REQ_STOP_RXTX:
+               break;
+       default:
+               DRV_LOG(ERR, "port %u unknown request (req_type %d)",
+                       dev->data->port_id, type);
+               return;
+       }
+
+       if (mana_shared_data->secondary_cnt == 0)
+               return;
+
+       mp_init_msg(&request, type, dev->data->port_id);
+
+       if (rte_mp_request_sync(&request, &reply, &timeout)) {
+               if (rte_errno != ENOTSUP)
+                       DRV_LOG(ERR, "port %u failed to request Rx/Tx (%d)",
+                               dev->data->port_id, type);
+       } else if (reply.nb_sent != reply.nb_received) {
+               DRV_LOG(ERR, "port %u not all secondaries responded (%d)",
+                       dev->data->port_id, type);
+       } else {
+               /* Stop at the first secondary that reports a failure */
+               for (idx = 0; idx < reply.nb_received; idx++) {
+                       const struct mana_mp_param *answer =
+                               (const struct mana_mp_param *)
+                               reply.msgs[idx].param;
+
+                       if (answer->result) {
+                               DRV_LOG(ERR, "port %u request failed on secondary %d",
+                                       dev->data->port_id, idx);
+                               break;
+                       }
+               }
+       }
+
+       free(reply.msgs);
+}
diff --git a/drivers/net/mana/version.map b/drivers/net/mana/version.map
new file mode 100644
index 0000000000..c2e0723b4c
--- /dev/null
+++ b/drivers/net/mana/version.map
@@ -0,0 +1,3 @@
+DPDK_22 {
+       local: *;
+};
diff --git a/drivers/net/meson.build b/drivers/net/meson.build
index 2355d1cde8..0b111a6ebb 100644
--- a/drivers/net/meson.build
+++ b/drivers/net/meson.build
@@ -34,6 +34,7 @@ drivers = [
         'ixgbe',
         'kni',
         'liquidio',
+        'mana',
         'memif',
         'mlx4',
         'mlx5',
-- 
2.17.1

Reply via email to