From: xiongweimin <[email protected]>

This commit adds foundational resource management capabilities to the
vhost-user RDMA driver, laying the groundwork for full RDMA operation:

1. Memory Region (MR) Management:
   - DMA MR registration via GET_DMA_MR
   - Two-level page table for large scatter-gather lists
   - CREATE_MR/DEREG_MR backend command flow
   - Atomic command execution over the device virtqueue (sketched below)
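
For reference, every backend verb added here follows the same
marshal/execute/unmarshal pattern. An abbreviated sketch (allocation and
error handling omitted; the command/response structs and
vrdma_exec_verbs_cmd() are the ones introduced by this series):

    struct scatterlist in, out;

    /* driver -> device: command; device -> driver: response */
    sg_init_one(&in, cmd, sizeof(*cmd));
    sg_init_one(&out, rsp, sizeof(*rsp));
    rc = vrdma_exec_verbs_cmd(vdev, VIRTIO_RDMA_CMD_CREATE_MR, &in, &out);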

2. Global Identifier (GID) Management:
   - ADD_GID/DEL_GID backend commands
   - RoCE v1/v2 GID type support (mapping sketched below)
   - Port-based GID table operations
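
A minimal sketch of how the add_gid hook populates the wire command
(field names are this patch's vrdma_cmd_add_gid; the GID type constants
come from <rdma/ib_verbs.h>):

    memcpy(cmd->gid, attr->gid.raw, sizeof(cmd->gid)); /* 16-byte GID */
    cmd->gid_type = attr->gid_type; /* IB_GID_TYPE_ROCE (RoCE v1) or
                                     * IB_GID_TYPE_ROCE_UDP_ENCAP (RoCE v2) */
    cmd->index    = attr->index;    /* slot in the per-port GID table */
    cmd->port_num = attr->port_num;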

3. User Context (ucontext) Support:
   - Allocation and deallocation hooks
   - Device association for future PD/CQ/MR management

4. Address Handle (AH) Management:
   - RoCE-specific AH creation/validation (AV packing sketched below)
   - Unicast GRH enforcement
   - Device-wide AH limit tracking
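
Since AHs are software-only in this driver, creation reduces to caching
the address vector; condensed from vrdma_create_ah(), one 32-bit word
packs the 8-bit traffic class above the 20-bit flow label:

    ah->av.sl_tclass_flowlabel = (u32)(grh->traffic_class << 20) |
                                 (grh->flow_label & 0xfffff);
    memcpy(ah->av.dgid, grh->dgid.raw, sizeof(ah->av.dgid));      /* 128-bit DGID */
    memcpy(ah->av.dmac, init_attr->ah_attr->roce.dmac, ETH_ALEN); /* next-hop MAC */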

Key technical features:
- MRs support both DMA-direct and user-backed registrations
- Page-table layout optimized for large scatter-gather lists (sizing sketched below)
- GID operations integrate with RDMA core notifications
- AHs store full address vectors for packet construction
- Resource limits enforced via atomic counters
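
The two-level MR page table sizes itself from the S/G limit; a short
sketch of the math, assuming 4 KiB pages and 8-byte entries as the
driver code does:

    /* each L2 page holds PAGE_SIZE / sizeof(u64) = 512 DMA addresses */
    nl2 = DIV_ROUND_UP(npages, 512); /* == (npages + 511) / 512 */
    /* e.g. npages = 1024 -> 2 L2 tables; npages = 513 -> also 2 */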

Signed-off-by: Xiong Weimin <[email protected]>
---
 .../infiniband/hw/virtio/vrdma_dev_api.h      |  40 ++
 .../drivers/infiniband/hw/virtio/vrdma_ib.c   | 600 ++++++++++++++++++
 .../drivers/infiniband/hw/virtio/vrdma_ib.h   |  80 +++
 3 files changed, 720 insertions(+)
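
Note: for context, alloc_mr/dereg_mr back the in-kernel fast-registration
API. A minimal consumer-side sketch (kernel code; error handling omitted,
and the map_mr_sg hook that fills the page table is not part of this
patch):

    struct ib_mr *mr;
    int n;

    mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 128); /* HW MR + page tables */
    n = ib_map_mr_sg(mr, sg, sg_nents, NULL, PAGE_SIZE); /* populate pages */
    ...
    ib_dereg_mr(mr); /* VIRTIO_RDMA_CMD_DEREG_MR + page-table teardown */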

diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h
index d1db1bea4..da99f1f32 100644
--- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h
+++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h
@@ -160,6 +160,46 @@ struct vrdma_cmd_destroy_qp {
     __u32 qpn;
 };
 
+struct vrdma_cmd_get_dma_mr {
+       __u32 pdn;
+       __u32 access_flags;
+};
+
+struct vrdma_rsp_get_dma_mr {
+       __u32 mrn;
+       __u32 lkey;
+       __u32 rkey;
+};
+
+struct vrdma_cmd_create_mr {
+       __u32 pdn;
+       __u32 access_flags;
+
+       __u32 max_num_sg;
+};
+
+struct vrdma_rsp_create_mr {
+       __u32 mrn;
+       __u32 lkey;
+       __u32 rkey;
+};
+
+struct vrdma_cmd_dereg_mr {
+       __u32 mrn;
+};
+
+struct vrdma_cmd_add_gid {
+       __u8 gid[16];
+       __u32 gid_type;
+       __u16 index;
+       __u32 port_num;
+};
+
+struct vrdma_cmd_del_gid {
+       __u16 index;
+       __u32 port_num;
+};
+
 #define VRDMA_CTRL_OK  0
 #define VRDMA_CTRL_ERR 1
 
diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
index f1f53314f..b4c16ddbb 100644
--- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
+++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
@@ -1086,6 +1086,597 @@ static int vrdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
        return rc;
 }
 
+/**
+ * vrdma_get_dma_mr - Get a DMA memory region (uncached, direct-access MR)
+ * @pd:                Protection Domain to associate this MR with
+ * @flags:     Access permissions (IB_ACCESS_LOCAL_WRITE, IB_ACCESS_REMOTE_READ, etc.)
+ *
+ * This function creates a special type of Memory Region (MR) that refers to
+ * physically contiguous or scatter-gather DMA-capable memory, typically used
+ * for zero-copy or kernel-space registrations without user buffer backing.
+ *
+ * It issues the VIRTIO_RDMA_CMD_GET_DMA_MR command to the backend device,
+ * which returns:
+ *   - An MR handle (mrn)
+ *   - Local Key (lkey)
+ *   - Remote Key (rkey)
+ *
+ * Unlike regular MRs created via ib_reg_user_mr(), this MR is not backed by
+ * any user-space virtual memory (i.e., no ib_umem). It is typically used for
+ * device-specific buffers, scratch memory, or control structures.
+ *
+ * Context: Called in process context. May sleep.
+ * Return:
+ * * &mr->ibmr on success
+ * * ERR_PTR(-ENOMEM) if memory allocation fails
+ * * ERR_PTR(-EIO) if device communication fails
+ */
+static struct ib_mr *vrdma_get_dma_mr(struct ib_pd *pd, int flags)
+{
+       struct vrdma_dev *vdev = to_vdev(pd->device);
+       struct vrdma_mr *mr;
+       struct vrdma_cmd_get_dma_mr *cmd;
+       struct vrdma_rsp_get_dma_mr *rsp;
+       struct scatterlist in, out;
+       int rc;
+
+       /* Allocate software MR structure */
+       mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+       if (!mr)
+               return ERR_PTR(-ENOMEM);
+
+       cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+       if (!cmd) {
+               rc = -ENOMEM;
+               goto err_free_mr;
+       }
+
+       rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
+       if (!rsp) {
+               rc = -ENOMEM;
+               goto err_free_cmd;
+       }
+
+       /* Prepare command parameters */
+       cmd->pdn = to_vpd(pd)->pd_handle;
+       cmd->access_flags = flags;
+
+       sg_init_one(&in, cmd, sizeof(*cmd));
+       sg_init_one(&out, rsp, sizeof(*rsp));
+
+       /* Send GET_DMA_MR command to device */
+       rc = vrdma_exec_verbs_cmd(vdev, VIRTIO_RDMA_CMD_GET_DMA_MR, &in, &out);
+       if (rc) {
+               dev_err(&vdev->vdev->dev,
+                       "GET_DMA_MR command failed: %d\n", rc);
+               goto err_free_rsp;
+       }
+
+       /* Initialize MR fields from response */
+       mr->mr_handle = rsp->mrn;
+       mr->ibmr.lkey = rsp->lkey;
+       mr->ibmr.rkey = rsp->rkey;
+       mr->ibmr.pd = pd;
+       mr->ibmr.device = pd->device;
+       mr->ibmr.type = IB_MR_TYPE_MEM_REG; /* Standard memory registration */
+
+       /* No backing user memory */
+       mr->umem = NULL;
+       mr->iova = 0;
+       mr->size = 0;
+       mr->pages = NULL;
+       mr->pages_k = NULL;
+       mr->dma_pages = 0;
+       mr->npages = 0;
+       mr->max_pages = 0;
+
+       /* Cleanup command/response buffers */
+       kfree(cmd);
+       kfree(rsp);
+
+       return &mr->ibmr;
+
+err_free_rsp:
+       kfree(rsp);
+
+err_free_cmd:
+       kfree(cmd);
+
+err_free_mr:
+       kfree(mr);
+       return ERR_PTR(rc);
+}
+
+/**
+ * vrdma_init_page_tbl - Initialize a two-level page table for MR management
+ * @dev:               vRDMA device pointer
+ * @npages:            Maximum number of data pages this table can map
+ * @pages_dma:         Output: L1 table whose entries hold the DMA addresses
+ *                     of the L2 tables
+ * @dma_pages_p:       Output: DMA address of the L1 table itself
+ *
+ * This function sets up a two-level page table structure used in Memory
+ * Region (MR) registration to support scatter-gather I/O. The layout is:
+ *
+ *   L1 (Level 1): Single page, DMA-coherent, holds pointers to L2 tables.
+ *                 Will be passed to hardware via WQE or command.
+ *
+ *   L2 (Level 2): Array of pages, each holding up to 512 x 8-byte DMA
+ *                 addresses (for 4KB page size). Each L2 table maps part
+ *                 of the S/G list.
+ *
+ * Example:
+ *   npages = 1024  => needs 1024 / 512 = 2 L2 tables
+ *
+ * Return:
+ *   Pointer to kernel virtual address of L1 table (pages_k), which stores
+ *   virtual addresses of L2 tables for cleanup.
+ *   On failure, returns NULL and cleans up all allocated memory.
+ */
+static uint64_t **vrdma_init_page_tbl(struct vrdma_dev *dev,
+                                     unsigned int npages,
+                                     uint64_t ***pages_dma,
+                                     dma_addr_t *dma_pages_p)
+{
+       unsigned int nl2 = (npages == 0) ? 0 : (npages + 511) / 512; /* ceil(npages / 512) */
+       uint64_t **l1_table;   /* L1: stores DMA addrs of L2s (device-readable) */
+       uint64_t **l1_table_k; /* L1: stores kernel vaddrs of L2s (for free) */
+       dma_addr_t l1_dma_addr;
+       dma_addr_t l2_dma_addr;
+       int i;
+
+       /* Allocate L1 table: must be DMA-coherent because device reads it */
+       l1_table = dma_alloc_coherent(dev->vdev->dev.parent, PAGE_SIZE,
+                                     &l1_dma_addr, GFP_KERNEL);
+       if (!l1_table)
+               return NULL;
+
+       /* Allocate kernel-space array to track L2 virtual addresses */
+       l1_table_k = kzalloc(PAGE_SIZE, GFP_KERNEL);
+       if (!l1_table_k)
+               goto err_free_l1_table;
+
+       /* Allocate each L2 table (DMA-coherent, one per 512 entries) */
+       for (i = 0; i < nl2; i++) {
+               l1_table_k[i] = dma_alloc_coherent(dev->vdev->dev.parent,
+                                                  PAGE_SIZE, &l2_dma_addr,
+                                                  GFP_KERNEL);
+               if (!l1_table_k[i])
+                       goto err_free_l2_tables;
+
+               l1_table[i] = (uint64_t *)l2_dma_addr; /* Device sees DMA address */
+       }
+
+       /* Output parameters */
+       *pages_dma = l1_table;      /* Device-visible L1 (with DMA pointers) */
+       *dma_pages_p = l1_dma_addr; /* DMA address of L1 table */
+
+       return l1_table_k; /* Return kernel view for later cleanup */
+
+err_free_l2_tables:
+       /* Roll back any successfully allocated L2 tables */
+       while (--i >= 0)
+               dma_free_coherent(dev->vdev->dev.parent, PAGE_SIZE,
+                                 l1_table_k[i], (dma_addr_t)l1_table[i]);
+       kfree(l1_table_k);
+
+err_free_l1_table:
+       dma_free_coherent(dev->vdev->dev.parent, PAGE_SIZE, l1_table,
+                         l1_dma_addr);
+
+       return NULL;
+}
+
+/**
+ * vrdma_free_page_tbl - Free a two-level page table
+ * @dev:               vRDMA device
+ * @pages_k:           Return value from vrdma_init_page_tbl (kernel L2 pointers)
+ * @pages:             L1 table with DMA addresses (output of pages_dma)
+ * @dma_pages:         DMA address of L1 table
+ * @npages:            Number of pages the table was sized for
+ *
+ * Frees both L1 and all L2 page tables allocated by vrdma_init_page_tbl.
+ */
+static void vrdma_free_page_tbl(struct vrdma_dev *dev,
+                               uint64_t **pages_k,
+                               uint64_t **pages,
+                               dma_addr_t dma_pages,
+                               unsigned int npages)
+{
+       unsigned int nl2 = (npages == 0) ? 0 : (npages + 511) / 512;
+       int i;
+
+       if (!pages_k || !pages)
+               return;
+
+       /* Free all L2 tables; pages[i] already holds each L2's DMA address */
+       for (i = 0; i < nl2; i++) {
+               if (pages_k[i])
+                       dma_free_coherent(dev->vdev->dev.parent, PAGE_SIZE,
+                                         pages_k[i], (dma_addr_t)pages[i]);
+       }
+
+       /* Free L1 tracking array */
+       kfree(pages_k);
+
+       /* Free L1 DMA table */
+       dma_free_coherent(dev->vdev->dev.parent, PAGE_SIZE, pages, dma_pages);
+}
+
+/**
+ * vrdma_alloc_mr - Allocate a multi-segment Memory Region (MR) with page tables
+ * @pd:                Protection Domain to associate the MR with
+ * @mr_type:   Type of MR (must be IB_MR_TYPE_MEM_REG)
+ * @max_num_sg:        Maximum number of scatter/gather entries supported by this MR
+ *
+ * This function allocates a software MR structure and reserves a hardware MR
+ * context on the backend vRDMA device. It prepares a two-level page table
+ * (L1/L2) to support up to @max_num_sg pages, which will later be filled
+ * during memory registration (e.g., via the ib_map_mr_sg() path).
+ *
+ * The allocated MR is not yet backed by any actual memory - it serves as a
+ * container for future page population (used primarily by fast-register
+ * mechanisms).
+ *
+ * Command flow:
+ *   - Sends VIRTIO_RDMA_CMD_CREATE_MR to device
+ *   - Receives mr_handle, lkey, rkey from response
+ *
+ * Context: Called in process context. May sleep.
+ * Return:
+ * * &mr->ibmr on success
+ * * ERR_PTR(-EINVAL) if unsupported MR type
+ * * ERR_PTR(-ENOMEM) if memory allocation fails
+ * * ERR_PTR(-EIO) if device command fails
+ */
+static struct ib_mr *vrdma_alloc_mr(struct ib_pd *pd,
+                                   enum ib_mr_type mr_type,
+                                   u32 max_num_sg)
+{
+       struct vrdma_dev *vdev = to_vdev(pd->device);
+       struct vrdma_mr *mr;
+       struct vrdma_cmd_create_mr *cmd;
+       struct vrdma_rsp_create_mr *rsp;
+       struct scatterlist in, out;
+       int rc;
+
+       /* Only support standard memory registration */
+       if (mr_type != IB_MR_TYPE_MEM_REG)
+               return ERR_PTR(-EINVAL);
+
+       /* Allocate software MR structure */
+       mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+       if (!mr)
+               return ERR_PTR(-ENOMEM);
+
+       cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+       if (!cmd) {
+               rc = -ENOMEM;
+               goto err_free_mr;
+       }
+
+       rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
+       if (!rsp) {
+               rc = -ENOMEM;
+               goto err_free_cmd;
+       }
+
+       /*
+        * Allocate two-level page table for S/G support.
+        * Each L2 table holds PAGE_SIZE / sizeof(u64) entries.
+        * L1 table points to multiple L2s.
+        */
+       mr->pages_k = vrdma_init_page_tbl(vdev, max_num_sg,
+                                               &mr->pages, &mr->dma_pages);
+       if (!mr->pages_k) {
+               dev_err(&vdev->vdev->dev,
+                       "Failed to allocate page table for %u S/G entries\n",
+                       max_num_sg);
+               rc = -ENOMEM;
+               goto err_free_rsp;
+       }
+
+       mr->max_pages = max_num_sg;
+       mr->npages = 0;
+       mr->umem = NULL; /* No user memory backing at this stage */
+       mr->iova = 0;
+       mr->size = 0;
+
+       /* Prepare command */
+       cmd->pdn = to_vpd(pd)->pd_handle;
+       cmd->max_num_sg = max_num_sg;
+
+       sg_init_one(&in, cmd, sizeof(*cmd));
+       sg_init_one(&out, rsp, sizeof(*rsp));
+
+       /* Send CREATE_MR command to backend device */
+       rc = vrdma_exec_verbs_cmd(vdev, VIRTIO_RDMA_CMD_CREATE_MR, &in, &out);
+       if (rc) {
+               dev_err(&vdev->vdev->dev, "CREATE_MR failed: %d\n", rc);
+               goto err_free_page_tbl;
+       }
+
+       /* Initialize MR metadata from response */
+       mr->mr_handle = rsp->mrn;
+       mr->ibmr.lkey = rsp->lkey;
+       mr->ibmr.rkey = rsp->rkey;
+       mr->ibmr.pd = pd;
+       mr->ibmr.device = &vdev->ib_dev;
+       mr->ibmr.type = IB_MR_TYPE_MEM_REG;
+
+       /* Clean up command/response buffers */
+       kfree(cmd);
+       kfree(rsp);
+
+       return &mr->ibmr;
+
+err_free_page_tbl:
+       vrdma_free_page_tbl(vdev, mr->pages_k, mr->pages, mr->dma_pages,
+                                 max_num_sg);
+err_free_rsp:
+       kfree(rsp);
+err_free_cmd:
+       kfree(cmd);
+err_free_mr:
+       kfree(mr);
+       return ERR_PTR(rc);
+}
+
+/**
+ * vrdma_dereg_mr - Deregister and destroy a Memory Region (MR)
+ * @ibmr:      The IB memory region to deregister
+ * @udata:     User data (optional, for user-space MRs)
+ *
+ * This function unregisters a previously allocated MR from the vRDMA device.
+ * It performs the following steps:
+ *   1. Sends VIRTIO_RDMA_CMD_DEREG_MR command to the backend device
+ *   2. Frees software page tables (L1/L2) used for scatter-gather mapping
+ *   3. Releases user memory (if any) via ib_umem_release()
+ *   4. Frees local metadata (struct vrdma_mr)
+ *
+ * Context: Called in process context. May sleep.
+ * Return:
+ * * 0 on success
+ * * -EIO if device communication fails
+ * * Other negative errno codes on allocation failure (rare during dereg)
+ */
+static int vrdma_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
+{
+       struct vrdma_dev *vdev = to_vdev(ibmr->device);
+       struct vrdma_mr *mr = to_vmr(ibmr);
+       struct vrdma_cmd_dereg_mr *cmd;
+       struct scatterlist in;
+       int rc;
+
+       /* Allocate command buffer */
+       cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+       if (!cmd)
+               return -ENOMEM;
+
+       /* Prepare command */
+       cmd->mrn = mr->mr_handle;
+       sg_init_one(&in, cmd, sizeof(*cmd));
+
+       /* Notify hardware to release MR context */
+       rc = vrdma_exec_verbs_cmd(vdev, VIRTIO_RDMA_CMD_DEREG_MR, &in, NULL);
+       if (rc) {
+               dev_err(&vdev->vdev->dev,
+                       "VIRTIO_RDMA_CMD_DEREG_MR failed for mrn=0x%x, err=%d\n",
+                       mr->mr_handle, rc);
+               rc = -EIO;
+               goto out_free_cmd;
+       }
+
+       /* Free two-level page table used for S/G entries */
+       vrdma_free_page_tbl(vdev, mr->pages_k, mr->pages, mr->dma_pages,
+                           mr->max_pages);
+
+       /* Release user memory if present */
+       if (mr->umem)
+               ib_umem_release(mr->umem);
+
+       /* Success: free the command buffer and the software MR itself */
+       kfree(cmd);
+       kfree(mr);
+       return 0;
+
+out_free_cmd:
+       kfree(cmd);
+       return rc;
+}
+
+/**
+ * vrdma_add_gid - Add a GID (Global Identifier) entry to the hardware
+ * @attr:      GID attribute containing port, index, GID value, and GID type
+ * @context:   Pointer to store driver-specific context (unused in vRDMA)
+ *
+ * This callback is invoked by the RDMA core when a GID table entry is added,
+ * either manually via sysfs or automatically during IPv6 address assignment.
+ *
+ * The function sends VIRTIO_RDMA_CMD_ADD_GID to the backend device to register
+ * the GID at the specified index and port. This allows the device to use this
+ * GID for RoCE traffic (e.g., as source in GRH).
+ *
+ * Note: The @context parameter is unused in vRDMA drivers since no additional
+ * per-GID software state is maintained.
+ *
+ * Context: Can sleep (called in process context).
+ * Return:
+ * * 0 on success
+ * * -ENOMEM if kmalloc fails
+ * * -EIO if device command fails
+ */
+static int vrdma_add_gid(const struct ib_gid_attr *attr, void **context)
+{
+       struct vrdma_dev *vdev = to_vdev(attr->device);
+       struct vrdma_cmd_add_gid *cmd;
+       struct scatterlist in;
+       int rc;
+
+       cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+       if (!cmd)
+               return -ENOMEM;
+
+       /* Fill command parameters */
+       memcpy(cmd->gid, attr->gid.raw, sizeof(cmd->gid));
+       cmd->index = attr->index;
+       cmd->port_num = attr->port_num;
+       cmd->gid_type = attr->gid_type; /* e.g., IB_GID_TYPE_ROCE or IB_GID_TYPE_ROCE_UDP_ENCAP */
+
+       sg_init_one(&in, cmd, sizeof(*cmd));
+
+       /* Send command to backend */
+       rc = vrdma_exec_verbs_cmd(vdev, VIRTIO_RDMA_CMD_ADD_GID, &in, NULL);
+       if (rc)
+               dev_err(&vdev->vdev->dev,
+                       "ADD_GID failed: port=%u index=%u type=%d, err=%d\n",
+                       attr->port_num, attr->index, attr->gid_type, rc);
+
+       kfree(cmd);
+       return rc ? -EIO : 0;
+}
+
+/**
+ * vrdma_del_gid - Remove a GID entry from the hardware
+ * @attr:      GID attribute specifying which GID to delete (by index/port)
+ * @context:   Driver-specific context (passed from add_gid; unused here)
+ *
+ * This callback is called when a GID is removed from the GID table. It
+ * notifies the backend device to invalidate the GID mapping at the given
+ * index.
+ *
+ * The @context pointer is ignored because vRDMA does not maintain per-GID
+ * software state.
+ *
+ * Context: Can sleep (process context).
+ * Return:
+ * * 0 on success
+ * * -ENOMEM if allocation fails
+ * * -EIO if device command fails
+ */
+static int vrdma_del_gid(const struct ib_gid_attr *attr, void **context)
+{
+       struct vrdma_dev *vdev = to_vdev(attr->device);
+       struct vrdma_cmd_del_gid *cmd;
+       struct scatterlist in;
+       int rc;
+
+       cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+       if (!cmd)
+               return -ENOMEM;
+
+       /* Only index and port are needed to identify the GID */
+       cmd->index = attr->index;
+       cmd->port_num = attr->port_num;
+
+       sg_init_one(&in, cmd, sizeof(*cmd));
+
+       /* Send command to backend */
+       rc = vrdma_exec_verbs_cmd(vdev, VIRTIO_RDMA_CMD_DEL_GID, &in, NULL);
+       if (rc)
+               dev_err(&vdev->vdev->dev,
+                       "DEL_GID failed: port=%u index=%u, err=%d\n",
+                       attr->port_num, attr->index, rc);
+
+       kfree(cmd);
+       return rc ? -EIO : 0;
+}
+
+static int vrdma_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata)
+{
+       struct vrdma_ucontext *vuc = to_vucontext(uctx);
+
+       vuc->dev = to_vdev(uctx->device);
+
+       return 0;
+}
+
+static void vrdma_dealloc_ucontext(struct ib_ucontext *ibcontext)
+{
+       /* Nothing to release yet; the ucontext itself is freed by the core */
+}
+
+/**
+ * vrdma_create_ah - Create an Address Handle (AH) for RoCE communication
+ * @ibah:      IB address handle to initialize
+ * @init_attr: AH initialization attributes
+ * @udata:     User data (unused in vRDMA)
+ *
+ * This function creates a software-only Address Handle (AH), which represents
+ * a remote destination for UD sends. Since this is a virtualized driver, no
+ * hardware command is sent; instead, the AH context is stored locally in
+ * struct vrdma_ah for later use during packet construction.
+ *
+ * The AH must:
+ *   - Be RoCE type
+ *   - Contain GRH (Global Routing Header)
+ *   - Not be multicast (currently unsupported)
+ *
+ * Also enforces the device limit on the maximum number of active AHs via an
+ * atomic counter.
+ *
+ * Context: Can sleep (called in process context).
+ * Return:
+ * * 0 on success
+ * * -EINVAL if attributes are invalid
+ * * -ENOMEM if AH limit exceeded
+ */
+static int vrdma_create_ah(struct ib_ah *ibah,
+                          struct rdma_ah_init_attr *init_attr,
+                          struct ib_udata *udata)
+{
+       struct vrdma_dev *vdev = to_vdev(ibah->device);
+       struct vrdma_ah *ah = to_vah(ibah);
+       const struct ib_global_route *grh;
+       u32 port_num = rdma_ah_get_port_num(init_attr->ah_attr);
+
+       /* Must have GRH enabled */
+       if (!(rdma_ah_get_ah_flags(init_attr->ah_attr) & IB_AH_GRH))
+               return -EINVAL;
+
+       grh = rdma_ah_read_grh(init_attr->ah_attr);
+
+       /* Only support RoCE type and unicast DGID */
+       if (init_attr->ah_attr->type != RDMA_AH_ATTR_TYPE_ROCE)
+               return -EINVAL;
+
+       if (rdma_is_multicast_addr((struct in6_addr *)grh->dgid.raw)) {
+               dev_dbg(&vdev->vdev->dev, "Multicast GID not supported in AH\n");
+               return -EINVAL;
+       }
+
+       /* Enforce max_ah limit; atomic_add_unless() fails once the cap is hit */
+       if (!atomic_add_unless(&vdev->num_ah, 1, vdev->ib_dev.attrs.max_ah)) {
+               dev_dbg(&vdev->vdev->dev, "Exceeded max number of AHs (%u)\n",
+                       vdev->ib_dev.attrs.max_ah);
+               return -ENOMEM;
+       }
+
+       /* Initialize AV (Address Vector) with relevant fields */
+       ah->av.port = port_num;
+       ah->av.pdn = to_vpd(ibah->pd)->pd_handle;   /* Protection Domain number */
+       ah->av.gid_index = grh->sgid_index;         /* Source GID table index */
+       ah->av.hop_limit = grh->hop_limit;
+       ah->av.sl_tclass_flowlabel = (u32)(grh->traffic_class << 20) |
+                                    (grh->flow_label & 0xfffff); /* 8-bit traffic class + 20-bit flow label */
+
+       memcpy(ah->av.dgid, grh->dgid.raw, sizeof(ah->av.dgid)); /* 128-bit dest GID */
+       memcpy(ah->av.dmac, init_attr->ah_attr->roce.dmac, ETH_ALEN); /* Next-hop MAC */
+
+       return 0;
+}
+
+/**
+ * vrdma_destroy_ah - Destroy an Address Handle
+ * @ibah:      The IB address handle to destroy
+ * @flags:     Destroy flags (e.g., for deferred cleanup; unused here)
+ *
+ * This callback releases the software state associated with an AH.
+ * It decrements the per-device AH counter to allow new AH creation.
+ *
+ * No hardware interaction is needed since AHs are purely software constructs
+ * in this virtio-rdma implementation.
+ *
+ * Context: Does not sleep, so it is safe even when the caller does not pass
+ * RDMA_DESTROY_AH_SLEEPABLE in @flags.
+ * Return: Always returns 0 (success).
+ */
+static int vrdma_destroy_ah(struct ib_ah *ibah, u32 flags)
+{
+       struct vrdma_dev *vdev = to_vdev(ibah->device);
+
+       atomic_dec(&vdev->num_ah);
+
+       return 0;
+}
+
 static const struct ib_device_ops vrdma_dev_ops = {
        .owner = THIS_MODULE,
        .uverbs_abi_ver = VIRTIO_RDMA_ABI_VERSION,
@@ -1101,6 +1692,15 @@ static const struct ib_device_ops vrdma_dev_ops = {
        .dealloc_pd = vrdma_dealloc_pd,
        .create_qp = vrdma_create_qp,
        .destroy_qp = vrdma_destroy_qp,
+       .get_dma_mr = vrdma_get_dma_mr,
+       .alloc_mr = vrdma_alloc_mr,
+       .dereg_mr = vrdma_dereg_mr,
+       .add_gid = vrdma_add_gid,
+       .del_gid = vrdma_del_gid,
+       .alloc_ucontext = vrdma_alloc_ucontext,
+       .dealloc_ucontext = vrdma_dealloc_ucontext,
+       .create_ah = vrdma_create_ah,
+       .destroy_ah = vrdma_destroy_ah,
 };
 
 /**
diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.h b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.h
index ba88599c8..6759c4349 100644
--- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.h
+++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.h
@@ -11,6 +11,8 @@
 #include <rdma/ib_verbs.h>
 #include <rdma/vrdma_abi.h>
 
+#include "vrdma_abi.h"
+
 #define VRDMA_COMM_TIMEOUT 1000000
 
 enum vrdma_type {
@@ -130,6 +132,11 @@ struct vrdma_ucontext {
        struct vrdma_dev *dev;
 };
 
+struct vrdma_ah {
+       struct ib_ah ibah;
+       struct vrdma_av av;
+};
+
 /**
  * struct vrdma_qp - Virtual RDMA Queue Pair (QP) private data
  *
@@ -166,6 +173,64 @@ struct vrdma_qp {
        struct vrdma_user_mmap_entry *rq_entry; /* Mmap entry for RQ buffer */
 };
 
+/**
+ * struct vrdma_mr - Software state of a Virtio-RDMA Memory Region (MR)
+ * @ibmr:              InfiniBand core MR object (contains rkey, lkey, etc.)
+ * @umem:              User memory descriptor from ib_umem_get(), holds
+ *                     page list and reference to user VMA
+ * @mr_handle:         Handle returned by backend device for this MR
+ * @iova:              I/O virtual address (start of the mapped region)
+ * @size:              Total size of the memory region in bytes
+ * @pages:             Level 1 (L1) page table - array whose entries hold the
+ *                     DMA addresses of the level 2 (L2) page tables. This is
+ *                     the device-visible view of the scatter-gather layout.
+ * @pages_k:           Array of kernel virtual addresses of L2 page tables.
+ *                     Used to free memory correctly during cleanup.
+ * @dma_pages:         DMA address of the L1 page table (first-level table),
+ *                     to be passed to the device or written in command WQE.
+ * @npages:            Number of valid pages in the memory region
+ * @max_pages:         Maximum number of pages the current page table
+ *                     allocation can hold (fixed when the MR is allocated)
+ *
+ * This structure represents a registered memory region in the vRDMA driver.
+ * It supports large memory registrations using a two-level page table design:
+ *
+ *   L1 Page Table (contiguous DMA-mapped):
+ *     Contains pointers to multiple L2 tables (each L2 = one page).
+ *
+ *   L2 Page Tables:
+ *     Each stores up to N DMA addresses (physical page addresses).
+ *
+ * The layout allows efficient hardware access while keeping kernel
+ * allocations manageable for very large mappings (e.g., tens of GB).
+ *
+ * Example layout for 4K pages and 512 entries per L2 table:
+ *
+ *   L1 (dma_pages) -> [L2_0] -> [DMA_ADDR_A, ..., DMA_ADDR_Z]
+ *                     [L2_1] -> [DMA_ADDR_X, ..., DMA_ADDR_Y]
+ *                     ...
+ *
+ * Used during:
+ *   - MR registration (e.g., the alloc_mr()/reg_user_mr() verbs)
+ *   - SEND/WRITE/READ operations with remote access
+ *   - MR invalidation and cleanup in vrdma_dereg_mr()
+ */
+struct vrdma_mr {
+       struct ib_mr ibmr;
+       struct ib_umem *umem;
+
+       u32 mr_handle;
+       u64 iova;
+       u64 size;
+
+       u64 **pages;        /* L1: array of L2 table DMA address pointers */
+       u64 **pages_k;      /* L1: array of L2 table kernel virtual addresses */
+       dma_addr_t dma_pages; /* DMA address of the L1 table itself */
+
+       u32 npages;
+       u32 max_pages;
+};
+
 static inline struct vrdma_cq *to_vcq(struct ib_cq *ibcq)
 {
        return container_of(ibcq, struct vrdma_cq, ibcq);
@@ -181,6 +246,21 @@ static inline struct vrdma_qp *to_vqp(struct ib_qp *ibqp)
        return container_of(ibqp, struct vrdma_qp, ibqp);
 }
 
+static inline struct vrdma_mr *to_vmr(struct ib_mr *ibmr)
+{
+       return container_of(ibmr, struct vrdma_mr, ibmr);
+}
+
+static inline struct vrdma_ucontext *to_vucontext(struct ib_ucontext *ibucontext)
+{
+       return container_of(ibucontext, struct vrdma_ucontext, ibucontext);
+}
+
+static inline struct vrdma_ah *to_vah(struct ib_ah *ibah)
+{
+       return container_of(ibah, struct vrdma_ah, ibah);
+}
+
 int vrdma_register_ib_device(struct vrdma_dev *vrdev);
 void vrdma_unregister_ib_device(struct vrdma_dev *vrdev);
 
-- 
2.43.0
