From: Long Li <lon...@microsoft.com>

MANA hardware has an IOMMU built in, which provides hardware-safe access to
user memory through memory registration. Since memory registration is an
expensive operation, this patch implements a two-level memory registration
cache mechanism: one cache per queue and one per port.

Signed-off-by: Long Li <lon...@microsoft.com>
---
 drivers/net/mana/mana.c      |  20 +++
 drivers/net/mana/mana.h      |  38 ++++
 drivers/net/mana/meson.build |   1 +
 drivers/net/mana/mp.c        |  85 +++++++++
 drivers/net/mana/mr.c        | 339 +++++++++++++++++++++++++++++++++++
 5 files changed, 483 insertions(+)
 create mode 100644 drivers/net/mana/mr.c

diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index 6b1c3ee035..6c8983cd6a 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -132,6 +132,8 @@ mana_dev_close(struct rte_eth_dev *dev)
        struct mana_priv *priv = dev->data->dev_private;
        int ret;
 
+       remove_all_mr(priv);
+
        ret = mana_intr_uninstall(priv);
        if (ret)
                return ret;
@@ -346,6 +348,13 @@ static int mana_dev_tx_queue_setup(struct rte_eth_dev *dev,
                goto fail;
        }
 
+       ret = mana_mr_btree_init(&txq->mr_btree,
+                                MANA_MR_BTREE_PER_QUEUE_N, 0);
+       if (ret) {
+               DRV_LOG(ERR, "Failed to init TXQ MR btree");
+               goto fail;
+       }
+
        DRV_LOG(DEBUG, "idx %u nb_desc %u socket %u txq->desc_ring %p",
                queue_idx, nb_desc, socket_id, txq->desc_ring);
 
@@ -367,6 +376,8 @@ static void mana_dev_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
 {
        struct mana_txq *txq = dev->data->tx_queues[qid];
 
+       mana_mr_btree_free(&txq->mr_btree);
+
        rte_free(txq->desc_ring);
        rte_free(txq);
 }
@@ -403,6 +414,13 @@ static int mana_dev_rx_queue_setup(struct rte_eth_dev *dev,
                goto fail;
        }
 
+       ret = mana_mr_btree_init(&rxq->mr_btree,
+                                MANA_MR_BTREE_PER_QUEUE_N, socket_id);
+       if (ret) {
+               DRV_LOG(ERR, "Failed to init RXQ MR btree");
+               goto fail;
+       }
+
        rxq->num_desc = nb_desc;
 
        rxq->priv = priv;
@@ -422,6 +440,8 @@ static void mana_dev_rx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
 {
        struct mana_rxq *rxq = dev->data->rx_queues[qid];
 
+       mana_mr_btree_free(&rxq->mr_btree);
+
        rte_free(rxq->desc_ring);
        rte_free(rxq);
 }
diff --git a/drivers/net/mana/mana.h b/drivers/net/mana/mana.h
index 1efb2330ee..b1ef9ce60b 100644
--- a/drivers/net/mana/mana.h
+++ b/drivers/net/mana/mana.h
@@ -50,6 +50,22 @@ struct mana_shared_data {
 #define MAX_RECEIVE_BUFFERS_PER_QUEUE  256
 #define MAX_SEND_BUFFERS_PER_QUEUE     256
 
+struct mana_mr_cache {
+       uint32_t        lkey;
+       uintptr_t       addr;
+       size_t          len;
+       void            *verb_obj;
+};
+
+#define MANA_MR_BTREE_CACHE_N  512
+struct mana_mr_btree {
+       uint16_t        len;    /* Used entries */
+       uint16_t        size;   /* Total entries */
+       int             overflow;
+       int             socket;
+       struct mana_mr_cache *table;
+};
+
 struct mana_process_priv {
        void *db_page;
 };
@@ -82,6 +98,7 @@ struct mana_priv {
        int max_recv_sge;
        int max_mr;
        uint64_t max_mr_size;
+       struct mana_mr_btree mr_btree;
        rte_rwlock_t    mr_list_lock;
 };
 
@@ -132,6 +149,7 @@ struct mana_txq {
        uint32_t desc_ring_head, desc_ring_tail;
 
        struct mana_stats stats;
+       struct mana_mr_btree mr_btree;
        unsigned int socket;
 };
 
@@ -154,6 +172,7 @@ struct mana_rxq {
        struct mana_gdma_queue gdma_cq;
 
        struct mana_stats stats;
+       struct mana_mr_btree mr_btree;
 
        unsigned int socket;
 };
@@ -177,6 +196,24 @@ uint16_t mana_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
 uint16_t mana_tx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
                               uint16_t pkts_n);
 
+struct mana_mr_cache *find_pmd_mr(struct mana_mr_btree *local_tree,
+                                 struct mana_priv *priv,
+                                 struct rte_mbuf *mbuf);
+int new_pmd_mr(struct mana_mr_btree *local_tree, struct mana_priv *priv,
+              struct rte_mempool *pool);
+void remove_all_mr(struct mana_priv *priv);
+void del_pmd_mr(struct mana_mr_cache *mr);
+
+void mana_mempool_chunk_cb(struct rte_mempool *mp, void *opaque,
+                          struct rte_mempool_memhdr *memhdr, unsigned int idx);
+
+struct mana_mr_cache *mana_mr_btree_lookup(struct mana_mr_btree *bt,
+                                          uint16_t *idx,
+                                          uintptr_t addr, size_t len);
+int mana_mr_btree_insert(struct mana_mr_btree *bt, struct mana_mr_cache *entry);
+int mana_mr_btree_init(struct mana_mr_btree *bt, int n, int socket);
+void mana_mr_btree_free(struct mana_mr_btree *bt);
+
 /** Request timeout for IPC. */
 #define MANA_MP_REQ_TIMEOUT_SEC 5
 
@@ -205,6 +242,7 @@ int mana_mp_init_secondary(void);
 void mana_mp_uninit_primary(void);
 void mana_mp_uninit_secondary(void);
 int mana_mp_req_verbs_cmd_fd(struct rte_eth_dev *dev);
+int mana_mp_req_mr_create(struct mana_priv *priv, uintptr_t addr, uint32_t len);
 
 void mana_mp_req_on_rxtx(struct rte_eth_dev *dev, enum mana_mp_req_type type);
 
diff --git a/drivers/net/mana/meson.build b/drivers/net/mana/meson.build
index 7ab34c253c..fc0dbaabb3 100644
--- a/drivers/net/mana/meson.build
+++ b/drivers/net/mana/meson.build
@@ -11,6 +11,7 @@ deps += ['pci', 'bus_pci', 'net', 'eal', 'kvargs']
 
 sources += files(
        'mana.c',
+       'mr.c',
        'mp.c',
 )
 
diff --git a/drivers/net/mana/mp.c b/drivers/net/mana/mp.c
index b2f5f7ab49..9cb3c09d32 100644
--- a/drivers/net/mana/mp.c
+++ b/drivers/net/mana/mp.c
@@ -34,6 +34,52 @@
 
 extern struct mana_shared_data *mana_shared_data;
 
+static int mana_mp_mr_create(struct mana_priv *priv, uintptr_t addr,
+                            uint32_t len)
+{
+       struct ibv_mr *ibv_mr;
+       int ret;
+       struct mana_mr_cache *mr;
+
+       ibv_mr = ibv_reg_mr(priv->ib_pd, (void *)addr, len,
+                           IBV_ACCESS_LOCAL_WRITE);
+
+       if (!ibv_mr)
+               return -errno;
+
+       DRV_LOG(DEBUG, "MR (2nd) lkey %u addr %px len 0x%lx",
+               ibv_mr->lkey, ibv_mr->addr, ibv_mr->length);
+
+       mr = rte_calloc("MANA MR", 1, sizeof(*mr), 0);
+       if (!mr) {
+               DRV_LOG(ERR, "(2nd) Failed to allocate MR");
+               ret = -ENOMEM;
+               goto fail_alloc;
+       }
+       mr->lkey = ibv_mr->lkey;
+       mr->addr = (uintptr_t)ibv_mr->addr;
+       mr->len = ibv_mr->length;
+       mr->verb_obj = ibv_mr;
+
+       rte_rwlock_write_lock(&priv->mr_list_lock);
+       ret = mana_mr_btree_insert(&priv->mr_btree, mr);
+       rte_rwlock_write_unlock(&priv->mr_list_lock);
+       if (ret) {
+               DRV_LOG(ERR, "(2nd) Failed to add to global MR btree");
+               goto fail_btree;
+       }
+
+       return 0;
+
+fail_btree:
+       rte_free(mr);
+
+fail_alloc:
+       ibv_dereg_mr(ibv_mr);
+
+       return ret;
+}
+
 static void mp_init_msg(struct rte_mp_msg *msg, enum mana_mp_req_type type,
                        int port_id)
 {
@@ -69,6 +115,12 @@ static int mana_mp_primary_handle(const struct rte_mp_msg *mp_msg,
        mp_init_msg(&mp_res, param->type, param->port_id);
 
        switch (param->type) {
+       case MANA_MP_REQ_CREATE_MR:
+               ret = mana_mp_mr_create(priv, param->addr, param->len);
+               res->result = ret;
+               ret = rte_mp_reply(&mp_res, peer);
+               break;
+
        case MANA_MP_REQ_VERBS_CMD_FD:
                mp_res.num_fds = 1;
                mp_res.fds[0] = priv->ib_ctx->cmd_fd;
@@ -211,6 +263,39 @@ int mana_mp_req_verbs_cmd_fd(struct rte_eth_dev *dev)
        return ret;
 }
 
+int mana_mp_req_mr_create(struct mana_priv *priv, uintptr_t addr, uint32_t len)
+{
+       struct rte_mp_msg mp_req = { 0 };
+       struct rte_mp_msg *mp_res;
+       struct rte_mp_reply mp_rep;
+       struct mana_mp_param *req = (struct mana_mp_param *)mp_req.param;
+       struct mana_mp_param *res;
+       struct timespec ts = {.tv_sec = MANA_MP_REQ_TIMEOUT_SEC, .tv_nsec = 0};
+       int ret;
+
+       mp_init_msg(&mp_req, MANA_MP_REQ_CREATE_MR, priv->port_id);
+       req->addr = addr;
+       req->len = len;
+
+       ret = rte_mp_request_sync(&mp_req, &mp_rep, &ts);
+       if (ret) {
+               DRV_LOG(ERR, "Port %u request to primary failed",
+                       req->port_id);
+               return ret;
+       }
+
+       if (mp_rep.nb_received != 1)
+               return -EPROTO;
+
+       mp_res = &mp_rep.msgs[0];
+       res = (struct mana_mp_param *)mp_res->param;
+       ret = res->result;
+
+       free(mp_rep.msgs);
+
+       return ret;
+}
+
 void mana_mp_req_on_rxtx(struct rte_eth_dev *dev, enum mana_mp_req_type type)
 {
        struct rte_mp_msg mp_req = { 0 };
diff --git a/drivers/net/mana/mr.c b/drivers/net/mana/mr.c
new file mode 100644
index 0000000000..926b3a6ebc
--- /dev/null
+++ b/drivers/net/mana/mr.c
@@ -0,0 +1,339 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2022 Microsoft Corporation
+ */
+
+#include <stddef.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <dirent.h>
+
+#include <rte_malloc.h>
+#include <ethdev_driver.h>
+#include <ethdev_pci.h>
+#include <rte_pci.h>
+#include <rte_bus_pci.h>
+#include <rte_common.h>
+#include <rte_kvargs.h>
+#include <rte_rwlock.h>
+#include <rte_spinlock.h>
+#include <rte_string_fns.h>
+#include <rte_alarm.h>
+#include <rte_log.h>
+#include <rte_eal_paging.h>
+#include <rte_io.h>
+
+#include <infiniband/verbs.h>
+#include <infiniband/manadv.h>
+
+#include "mana.h"
+
+struct mana_range {
+       uintptr_t       start;
+       uintptr_t       end;
+       uint32_t        len;
+};
+
+void mana_mempool_chunk_cb(struct rte_mempool *mp __rte_unused, void *opaque,
+                          struct rte_mempool_memhdr *memhdr, unsigned int idx)
+{
+       struct mana_range *ranges = opaque;
+       struct mana_range *range = &ranges[idx];
+       uint64_t page_size = rte_mem_page_size();
+
+       range->start = RTE_ALIGN_FLOOR((uintptr_t)memhdr->addr, page_size);
+       range->end = RTE_ALIGN_CEIL((uintptr_t)memhdr->addr + memhdr->len,
+                                   page_size);
+       range->len = range->end - range->start;
+}
+
+int new_pmd_mr(struct mana_mr_btree *local_tree, struct mana_priv *priv,
+              struct rte_mempool *pool)
+{
+       struct ibv_mr *ibv_mr;
+       struct mana_range ranges[pool->nb_mem_chunks];
+       uint32_t i;
+       struct mana_mr_cache *mr;
+       int ret;
+
+       rte_mempool_mem_iter(pool, mana_mempool_chunk_cb, ranges);
+
+       for (i = 0; i < pool->nb_mem_chunks; i++) {
+               if (ranges[i].len > priv->max_mr_size) {
+                       DRV_LOG(ERR, "memory chunk size %u exceeding max MR\n",
+                               ranges[i].len);
+                       return -ENOMEM;
+               }
+
+               DRV_LOG(DEBUG, "registering memory chunk start 0x%lx len 0x%x",
+                       ranges[i].start, ranges[i].len);
+
+               if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+                       /* Send a message to the primary to do MR */
+                       ret = mana_mp_req_mr_create(priv, ranges[i].start,
+                                                   ranges[i].len);
+                       if (ret) {
+                               DRV_LOG(ERR, "MR failed start 0x%lx len 0x%x",
+                                       ranges[i].start, ranges[i].len);
+                               return ret;
+                       }
+                       continue;
+               }
+
+               ibv_mr = ibv_reg_mr(priv->ib_pd, (void *)ranges[i].start,
+                                   ranges[i].len, IBV_ACCESS_LOCAL_WRITE);
+               if (ibv_mr) {
+                       DRV_LOG(DEBUG, "MR lkey %u addr %px len 0x%lx",
+                               ibv_mr->lkey, ibv_mr->addr, ibv_mr->length);
+
+                       mr = rte_calloc("MANA MR", 1, sizeof(*mr), 0);
+                       mr->lkey = ibv_mr->lkey;
+                       mr->addr = (uintptr_t)ibv_mr->addr;
+                       mr->len = ibv_mr->length;
+                       mr->verb_obj = ibv_mr;
+
+                       rte_rwlock_write_lock(&priv->mr_list_lock);
+                       ret = mana_mr_btree_insert(&priv->mr_btree, mr);
+                       rte_rwlock_write_unlock(&priv->mr_list_lock);
+                       if (ret) {
+                               ibv_dereg_mr(ibv_mr);
+                               DRV_LOG(ERR, "Failed to add to global MR btree");
+                               return ret;
+                       }
+
+                       ret = mana_mr_btree_insert(local_tree, mr);
+                       if (ret) {
+                               /* Don't need to clean up MR as it's already
+                                * in the global tree
+                                */
+                               DRV_LOG(ERR, "Failed to add to local MR btree");
+                               return ret;
+                       }
+               } else {
+                       DRV_LOG(ERR, "MR failed start 0x%lx len 0x%x",
+                               ranges[i].start, ranges[i].len);
+                       return -errno;
+               }
+       }
+       return 0;
+}
+
+void del_pmd_mr(struct mana_mr_cache *mr)
+{
+       int ret;
+       struct ibv_mr *ibv_mr = (struct ibv_mr *)mr->verb_obj;
+
+       ret = ibv_dereg_mr(ibv_mr);
+       if (ret)
+               DRV_LOG(ERR, "dereg MR failed ret %d", ret);
+}
+
+struct mana_mr_cache *find_pmd_mr(struct mana_mr_btree *local_mr_btree,
+                                 struct mana_priv *priv, struct rte_mbuf *mbuf)
+{
+       struct rte_mempool *pool = mbuf->pool;
+       int ret, second_try = 0;
+       struct mana_mr_cache *mr;
+       uint16_t idx;
+
+       DRV_LOG(DEBUG, "finding mr for mbuf addr %p len %d",
+               mbuf->buf_addr, mbuf->buf_len);
+
+try_again:
+       /* First try to find the MR in local queue tree */
+       mr = mana_mr_btree_lookup(local_mr_btree, &idx,
+                                 (uintptr_t)mbuf->buf_addr, mbuf->buf_len);
+       if (mr) {
+               DRV_LOG(DEBUG, "Local mr lkey %u addr %lx len %lu",
+                       mr->lkey, mr->addr, mr->len);
+               return mr;
+       }
+
+       /* If not found, try to find the MR in global tree */
+       rte_rwlock_read_lock(&priv->mr_list_lock);
+       mr = mana_mr_btree_lookup(&priv->mr_btree, &idx,
+                                 (uintptr_t)mbuf->buf_addr,
+                                 mbuf->buf_len);
+       rte_rwlock_read_unlock(&priv->mr_list_lock);
+
+       /* If found in the global tree, add it to the local tree */
+       if (mr) {
+               ret = mana_mr_btree_insert(local_mr_btree, mr);
+               if (ret) {
+                       DRV_LOG(DEBUG, "Failed to add MR to local tree.");
+                       return NULL;
+               }
+
+               DRV_LOG(DEBUG, "Added local mr lkey %u addr %lx len %lu",
+                       mr->lkey, mr->addr, mr->len);
+               return mr;
+       }
+
+       if (second_try) {
+               DRV_LOG(ERR, "Internal error second try failed");
+               return NULL;
+       }
+
+       ret = new_pmd_mr(local_mr_btree, priv, pool);
+       if (ret) {
+               DRV_LOG(ERR, "Failed to allocate MR ret %d addr %p len %d",
+                       ret, mbuf->buf_addr, mbuf->buf_len);
+               return NULL;
+       }
+
+       second_try = 1;
+       goto try_again;
+}
+
+void remove_all_mr(struct mana_priv *priv)
+{
+       struct mana_mr_btree *bt = &priv->mr_btree;
+       struct mana_mr_cache *mr;
+       struct ibv_mr *ibv_mr;
+       uint16_t i;
+
+       rte_rwlock_write_lock(&priv->mr_list_lock);
+       /* Start with index 1 as the 1st entry is always NULL */
+       for (i = 1; i < bt->len; i++) {
+               mr = &bt->table[i];
+               ibv_mr = mr->verb_obj;
+               ibv_dereg_mr(ibv_mr);
+       }
+       bt->len = 1;
+       rte_rwlock_write_unlock(&priv->mr_list_lock);
+}
+
+static int mana_mr_btree_expand(struct mana_mr_btree *bt, int n)
+{
+       void *mem;
+
+       mem = rte_realloc_socket(bt->table, n * sizeof(struct mana_mr_cache),
+                                0, bt->socket);
+       if (!mem) {
+               DRV_LOG(ERR, "Failed to expand btree size %d", n);
+               return -1;
+       }
+
+       DRV_LOG(ERR, "Expanded btree to size %d", n);
+       bt->table = mem;
+       bt->size = n;
+
+       return 0;
+}
+
+struct mana_mr_cache *mana_mr_btree_lookup(struct mana_mr_btree *bt,
+                                          uint16_t *idx,
+                                          uintptr_t addr, size_t len)
+{
+       struct mana_mr_cache *table;
+       uint16_t n;
+       uint16_t base = 0;
+       int ret;
+
+       n = bt->len;
+
+       /* Try to double the cache if it's full */
+       if (n == bt->size) {
+               ret = mana_mr_btree_expand(bt, bt->size << 1);
+               if (ret)
+                       return NULL;
+       }
+
+       table = bt->table;
+
+       /* Do binary search on addr */
+       do {
+               uint16_t delta = n >> 1;
+
+               if (addr < table[base + delta].addr) {
+                       n = delta;
+               } else {
+                       base += delta;
+                       n -= delta;
+               }
+       } while (n > 1);
+
+       *idx = base;
+
+       if (addr + len <= table[base].addr + table[base].len)
+               return &table[base];
+
+       DRV_LOG(DEBUG, "addr %lx len %lu idx %u sum %lx not found",
+               addr, len, *idx, addr + len);
+
+       return NULL;
+}
+
+int mana_mr_btree_init(struct mana_mr_btree *bt, int n, int socket)
+{
+       memset(bt, 0, sizeof(*bt));
+       bt->table = rte_calloc_socket("MANA B-tree table",
+                                     n,
+                                     sizeof(struct mana_mr_cache),
+                                     0, socket);
+       if (!bt->table) {
+               DRV_LOG(ERR, "Failed to allocate B-tree n %d socket %d",
+                       n, socket);
+               return -ENOMEM;
+       }
+
+       bt->socket = socket;
+       bt->size = n;
+
+       /* First entry must be NULL for binary search to work */
+       bt->table[0] = (struct mana_mr_cache) {
+               .lkey = UINT32_MAX,
+       };
+       bt->len = 1;
+
+       DRV_LOG(ERR, "B-tree initialized table %p size %d len %d",
+               bt->table, n, bt->len);
+
+       return 0;
+}
+
+void mana_mr_btree_free(struct mana_mr_btree *bt)
+{
+       rte_free(bt->table);
+       memset(bt, 0, sizeof(*bt));
+}
+
+int mana_mr_btree_insert(struct mana_mr_btree *bt, struct mana_mr_cache *entry)
+{
+       struct mana_mr_cache *table;
+       uint16_t idx = 0;
+       uint16_t shift;
+
+       if (mana_mr_btree_lookup(bt, &idx, entry->addr, entry->len)) {
+               DRV_LOG(DEBUG, "Addr %lx len %lu exists in btree",
+                       entry->addr, entry->len);
+               return 0;
+       }
+
+       if (bt->len >= bt->size) {
+               bt->overflow = 1;
+               return -1;
+       }
+
+       table = bt->table;
+
+       idx++;
+       shift = (bt->len - idx) * sizeof(struct mana_mr_cache);
+       if (shift) {
+               DRV_LOG(DEBUG, "Moving %u bytes from idx %u to %u",
+                       shift, idx, idx + 1);
+               memmove(&table[idx + 1], &table[idx], shift);
+       }
+
+       table[idx] = *entry;
+       bt->len++;
+
+       DRV_LOG(DEBUG, "Inserted MR b-tree table %p idx %d addr %lx len %lu",
+               table, idx, entry->addr, entry->len);
+
+       return 0;
+}
-- 
2.17.1

Reply via email to