Subject: Re: [Patch v6 01/18] net/mana: add basic driver, build
environment and doc
在 2022/8/31 6:51, lon...@linuxonhyperv.com 写道:
From: Long Li <lon...@microsoft.com>
MANA is a PCI device. It uses IB verbs to access hardware through
the kernel RDMA layer. This patch introduces build environment and
basic device probe functions.
Signed-off-by: Long Li <lon...@microsoft.com>
---
Change log:
v2:
Fix typos.
Make the driver build only on x86-64 and Linux.
Remove unused header files.
Change port definition to uint16_t or uint8_t (for IB).
Use getline() in place of fgets() to read and truncate a line.
v3:
Add meson build check for required functions from RDMA direct verb
header file
v4:
Remove extra "\n" in logging code.
Use "r" in place of "rb" in fopen() to read text files.
[snip]
+
+static int mana_pci_probe_mac(struct rte_pci_driver *pci_drv
__rte_unused,
+ struct rte_pci_device *pci_dev,
+ struct rte_ether_addr *mac_addr) {
+ struct ibv_device **ibv_list;
+ int ibv_idx;
+ struct ibv_context *ctx;
+ struct ibv_device_attr_ex dev_attr;
+ int num_devices;
+ int ret = 0;
+ uint8_t port;
+ struct mana_priv *priv = NULL;
+ struct rte_eth_dev *eth_dev = NULL;
+ bool found_port;
+
+ ibv_list = ibv_get_device_list(&num_devices);
+ for (ibv_idx = 0; ibv_idx < num_devices; ibv_idx++) {
+ struct ibv_device *ibdev = ibv_list[ibv_idx];
+ struct rte_pci_addr pci_addr;
+
+ DRV_LOG(INFO, "Probe device name %s dev_name %s
ibdev_path %s",
+ ibdev->name, ibdev->dev_name, ibdev-
ibdev_path);
+
+ if (mana_ibv_device_to_pci_addr(ibdev, &pci_addr))
+ continue;
+
+ /* Ignore if this IB device is not this PCI device */
+ if (pci_dev->addr.domain != pci_addr.domain ||
+ pci_dev->addr.bus != pci_addr.bus ||
+ pci_dev->addr.devid != pci_addr.devid ||
+ pci_dev->addr.function != pci_addr.function)
+ continue;
+
+ ctx = ibv_open_device(ibdev);
+ if (!ctx) {
+ DRV_LOG(ERR, "Failed to open IB device %s",
+ ibdev->name);
+ continue;
+ }
+
+ ret = ibv_query_device_ex(ctx, NULL, &dev_attr);
+ DRV_LOG(INFO, "dev_attr.orig_attr.phys_port_cnt %u",
+ dev_attr.orig_attr.phys_port_cnt);
+ found_port = false;
+
+ for (port = 1; port <= dev_attr.orig_attr.phys_port_cnt;
+ port++) {
+ struct ibv_parent_domain_init_attr attr = {};
+ struct rte_ether_addr addr;
+ char address[64];
+ char name[RTE_ETH_NAME_MAX_LEN];
+
+ ret = get_port_mac(ibdev, port, &addr);
+ if (ret)
+ continue;
+
+ if (mac_addr && !rte_is_same_ether_addr(&addr,
mac_addr))
+ continue;
+
+ rte_ether_format_addr(address, sizeof(address),
&addr);
+ DRV_LOG(INFO, "device located port %u address
%s",
+ port, address);
+ found_port = true;
+
+ priv = rte_zmalloc_socket(NULL, sizeof(*priv),
+ RTE_CACHE_LINE_SIZE,
+ SOCKET_ID_ANY);
+ if (!priv) {
+ ret = -ENOMEM;
+ goto failed;
+ }
+
+ snprintf(name, sizeof(name), "%s_port%d",
+ pci_dev->device.name, port);
+
+ if (rte_eal_process_type() ==
RTE_PROC_SECONDARY) {
+ int fd;
+
+ eth_dev =
rte_eth_dev_attach_secondary(name);
+ if (!eth_dev) {
+ DRV_LOG(ERR, "Can't attach to dev
%s",
+ name);
+ ret = -ENOMEM;
+ goto failed;
+ }
+
+ eth_dev->device = &pci_dev->device;
+ eth_dev->dev_ops = &mana_dev_sec_ops;
+ ret = mana_proc_priv_init(eth_dev);
+ if (ret)
+ goto failed;
+ priv->process_priv = eth_dev-
process_private;
+
+ /* Get the IB FD from the primary
process */
+ fd =
mana_mp_req_verbs_cmd_fd(eth_dev);
+ if (fd < 0) {
+ DRV_LOG(ERR, "Failed to get FD %d",
fd);
+ ret = -ENODEV;
+ goto failed;
+ }
+
+ ret =
mana_map_doorbell_secondary(eth_dev, fd);
+ if (ret) {
+ DRV_LOG(ERR, "Failed secondary
map %d",
+ fd);
+ goto failed;
+ }
+
+ /* fd is no not used after mapping
doorbell */
+ close(fd);
+
+ rte_spinlock_lock(&mana_shared_data-
lock);
+ mana_shared_data->secondary_cnt++;
+ mana_local_data.secondary_cnt++;
+ rte_spinlock_unlock(&mana_shared_data-
lock);
+
+ rte_eth_copy_pci_info(eth_dev, pci_dev);
+ rte_eth_dev_probing_finish(eth_dev);
+
+ /* Impossible to have more than one port
+ * matching a MAC address
+ */
+ continue;
+ }
+
+ eth_dev = rte_eth_dev_allocate(name);
+ if (!eth_dev) {
+ ret = -ENOMEM;
+ goto failed;
+ }
+
+ eth_dev->data->mac_addrs =
+ rte_calloc("mana_mac", 1,
+ sizeof(struct
rte_ether_addr), 0);
+ if (!eth_dev->data->mac_addrs) {
+ ret = -ENOMEM;
+ goto failed;
+ }
+
+ rte_ether_addr_copy(&addr, eth_dev->data-
mac_addrs);
+
+ priv->ib_pd = ibv_alloc_pd(ctx);
+ if (!priv->ib_pd) {
+ DRV_LOG(ERR, "ibv_alloc_pd failed port %d",
port);
+ ret = -ENOMEM;
+ goto failed;
+ }
+
+ /* Create a parent domain with the port number */
+ attr.pd = priv->ib_pd;
+ attr.comp_mask =
IBV_PARENT_DOMAIN_INIT_ATTR_PD_CONTEXT;
+ attr.pd_context = (void *)(uint64_t)port;
+ priv->ib_parent_pd = ibv_alloc_parent_domain(ctx,
&attr);
+ if (!priv->ib_parent_pd) {
+ DRV_LOG(ERR,
+ "ibv_alloc_parent_domain failed
port
%d",
+ port);
+ ret = -ENOMEM;
+ goto failed;
+ }
+
+ priv->ib_ctx = ctx;
+ priv->port_id = eth_dev->data->port_id;
+ priv->dev_port = port;
+ eth_dev->data->dev_private = priv;
+ priv->dev_data = eth_dev->data;
+
+ priv->max_rx_queues = dev_attr.orig_attr.max_qp;
+ priv->max_tx_queues = dev_attr.orig_attr.max_qp;
+
+ priv->max_rx_desc =
+ RTE_MIN(dev_attr.orig_attr.max_qp_wr,
+ dev_attr.orig_attr.max_cqe);
+ priv->max_tx_desc =
+ RTE_MIN(dev_attr.orig_attr.max_qp_wr,
+ dev_attr.orig_attr.max_cqe);
+
+ priv->max_send_sge = dev_attr.orig_attr.max_sge;
+ priv->max_recv_sge = dev_attr.orig_attr.max_sge;
+
+ priv->max_mr = dev_attr.orig_attr.max_mr;
+ priv->max_mr_size =
dev_attr.orig_attr.max_mr_size;
+
+ DRV_LOG(INFO, "dev %s max queues %d desc %d
sge %d",
+ name, priv->max_rx_queues, priv-
max_rx_desc,
+ priv->max_send_sge);
+
+ rte_spinlock_lock(&mana_shared_data->lock);
+ mana_shared_data->primary_cnt++;
+ rte_spinlock_unlock(&mana_shared_data->lock);
+
+ eth_dev->data->dev_flags |=
RTE_ETH_DEV_INTR_RMV;
+
+ eth_dev->device = &pci_dev->device;
+ eth_dev->data->dev_flags |=
+ RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;
+
Please do not use the temporary macro. Please review this patch:
f30e69b41f94 ("ethdev: add device flag to bypass auto-filled queue
xstats")
This patch requires that per queue statistics are filled in
.xstats_get() by PMD.