From: xiongweimin <[email protected]>

Add support for three essential RDMA resource management verbs:

1. P_Key Table Query:
   - Implements the query_pkey verb for partition key retrieval
   - Handles endianness conversion for cross-platform compatibility
   - Propagates device communication failures back to the caller

2. QP Attribute Query:
   - Full QP state retrieval including capabilities and AH attributes
   - Byte order handling for all struct fields
   - Init attribute preservation for consistency checks
   - Detailed error logging for debugging

3. User Memory Registration:
   - Memory pinning via ib_umem_get() with access flag enforcement
   - DMA-safe page table construction and bulk transfer to device
   - Multi-architecture DMA address handling
   - Strict memory boundary validation
   - Resource cleanup guarantees on all error paths

Key enhancements:
- Unified virtqueue command infrastructure
- Cross-architecture endianness handling
- Single-command page table transfer for registered memory regions
- Protection domain integration for memory access control
- Complete unwinding on every error path for robust resource recovery

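The new verbs are reachable from user space through the standard
libibverbs entry points. A minimal sketch (illustrative only, not part
of this patch; error checking omitted; assumes an already-opened device
context ctx, a protection domain pd and a QP qp):

    #include <infiniband/verbs.h>

    __be16 pkey;
    struct ibv_qp_attr attr;
    struct ibv_qp_init_attr init_attr;
    char buf[4096];

    /* P_Key at port 1, table index 0 */
    ibv_query_pkey(ctx, 1, 0, &pkey);

    /* QP state, capabilities and AH attributes */
    ibv_query_qp(qp, &attr, IBV_QP_STATE | IBV_QP_CAP | IBV_QP_AV,
                 &init_attr);

    /* Register a 4 KiB buffer for local write and remote read */
    struct ibv_mr *mr = ibv_reg_mr(pd, buf, sizeof(buf),
                                   IBV_ACCESS_LOCAL_WRITE |
                                   IBV_ACCESS_REMOTE_READ);
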
Signed-off-by: Xiong Weimin <[email protected]>
---
 .../infiniband/hw/virtio/vrdma_dev_api.h      |  35 ++
 .../drivers/infiniband/hw/virtio/vrdma_ib.c   | 333 +++++++++++++++++-
 2 files changed, 367 insertions(+), 1 deletion(-)

diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h
index d0ce02601..86b5ecade 100644
--- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h
+++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h
@@ -225,6 +225,41 @@ struct vrdma_rsp_modify_qp {
     __u32 qpn;
 };
 
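+/*
+ * All multi-byte fields in the command/response structures below are
+ * little-endian on the wire; the driver converts with cpu_to_le*() /
+ * le*_to_cpu().
+ */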
+struct vrdma_cmd_query_pkey {
+       __le32 port;
+       __le16 index;
+};
+
+struct vrdma_rsp_query_pkey {
+       __le16 pkey;
+};
+
+struct vrdma_cmd_query_qp {
+       __le32 qpn;
+       __le32 attr_mask;
+};
+
+struct vrdma_rsp_query_qp {
+       struct vrdma_qp_attr attr;
+};
+
+struct vrdma_cmd_reg_user_mr {
+       __le32 pdn;
+       __le32 access_flags;
+       __le64 start;
+       __le64 length;
+       __le64 virt_addr;
+
+       __le64 pages;
+       __le32 npages;
+};
+
+struct vrdma_rsp_reg_user_mr {
+       __le32 mrn;
+       __le32 lkey;
+       __le32 rkey;
+};
+
 #define VRDMA_CTRL_OK  0
 #define VRDMA_CTRL_ERR 1
 
diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
index f9b129774..b1429e072 100644
--- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
+++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
@@ -23,6 +23,7 @@
 #include "vrdma_queue.h"
 
 #define VRTIO_RDMA_PAGE_PER_TBL 512
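+/*
+ * Upper bound on pages in a single MR: VRTIO_RDMA_PAGE_PER_TBL entries per
+ * page-table page, and up to VRTIO_RDMA_PAGE_PER_TBL such tables
+ * (512 * 512 = 262144 pages, i.e. 1 GiB with 4 KiB pages).
+ */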
+#define VRDMA_MAX_PAGES         (VRTIO_RDMA_PAGE_PER_TBL * VRTIO_RDMA_PAGE_PER_TBL)
 
 /**
  * cmd_str - String representation of virtio RDMA control commands
@@ -86,6 +87,36 @@ static void rdma_ah_attr_to_vrdma(struct vrdma_ah_attr *dst,
        memcpy(&dst->roce, &src->roce, sizeof(struct roce_ah_attr));
 }
 
+static void vrdma_to_ib_global_route(struct ib_global_route *dst,
+                              const struct vrdma_global_route *src)
+{
+       dst->dgid = src->dgid;
+       dst->flow_label = src->flow_label;
+       dst->sgid_index = src->sgid_index;
+       dst->hop_limit = src->hop_limit;
+       dst->traffic_class = src->traffic_class;
+}
+
+static void vrdma_to_ib_qp_cap(struct ib_qp_cap *dst, const struct vrdma_qp_cap *src)
+{
+       dst->max_send_wr = src->max_send_wr;
+       dst->max_recv_wr = src->max_recv_wr;
+       dst->max_send_sge = src->max_send_sge;
+       dst->max_recv_sge = src->max_recv_sge;
+       dst->max_inline_data = src->max_inline_data;
+}
+
+static void vrdma_to_rdma_ah_attr(struct rdma_ah_attr *dst,
+                           const struct vrdma_ah_attr *src)
+{
+       vrdma_to_ib_global_route(rdma_ah_retrieve_grh(dst), &src->grh);
+       rdma_ah_set_sl(dst, src->sl);
+       rdma_ah_set_static_rate(dst, src->static_rate);
+       rdma_ah_set_port_num(dst, src->port_num);
+       rdma_ah_set_ah_flags(dst, src->ah_flags);
+       memcpy(&dst->roce, &src->roce, sizeof(struct roce_ah_attr));
+}
+
 /**
  * vrdma_exec_verbs_cmd - Execute a verbs command via control virtqueue
  * @vrdev: VRDMA device
@@ -2521,6 +2552,303 @@ static int vrdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
        return rc;
 }
 
+/**
+ * vrdma_query_pkey - Query Partition Key (P_Key) at given index
+ * @ibdev:     Verbs device (vRDMA virtual device)
+ * @port:      Port number (1-indexed)
+ * @index:     P_Key table index
+ * @pkey:      Output buffer to store the P_Key value
+ *
+ * Queries the P_Key from the backend via virtqueue command.
+ * Only meaningful for IB-style ports (not RoCE).
+ *
+ * Context: Process context (may sleep). Can be called from user IOCTL path.
+ * Return:
+ * * 0 on success
+ * * -ENOMEM if command allocation fails
+ * * -EIO or other negative errno on communication failure
+ */
+static int vrdma_query_pkey(struct ib_device *ibdev, u32 port, u16 index, u16 *pkey)
+{
+       struct vrdma_dev *vdev = to_vdev(ibdev);
+       struct vrdma_cmd_query_pkey *cmd;
+       struct vrdma_rsp_query_pkey *rsp;
+       struct scatterlist in, out;
+       int rc;
+
+       /* Allocate command and response buffers */
+       cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+       if (!cmd)
+               return -ENOMEM;
+
+       rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
+       if (!rsp) {
+               kfree(cmd);
+               return -ENOMEM;
+       }
+
+       /* Fill input parameters */
+       cmd->port = cpu_to_le32(port);
+       cmd->index = cpu_to_le16(index);
+
+       /* Prepare scatterlists for virtqueue I/O */
+       sg_init_one(&in, cmd, sizeof(*cmd));
+       sg_init_one(&out, rsp, sizeof(*rsp));
+
+       /* Execute command */
+       rc = vrdma_exec_verbs_cmd(vdev, VIRTIO_RDMA_CMD_QUERY_PKEY, &in, &out);
+       if (rc) {
+               dev_err(&vdev->vdev->dev,
+                       "VIRTIO_RDMA_CMD_QUERY_PKEY failed: port=%u idx=%u err=%d\n",
+                       port, index, rc);
+               goto out_free;
+       }
+
+       /* Copy result back to the caller */
+       *pkey = le16_to_cpu(rsp->pkey);
+
+out_free:
+       kfree(rsp);
+       kfree(cmd);
+       return rc;
+}
+
+/**
+ * vrdma_query_qp - Query QP attributes from the backend
+ * @ibqp:      Queue pair to query
+ * @attr:      Output structure for QP attributes
+ * @attr_mask: Which fields are requested (ignored by some backends)
+ * @init_attr: Output structure for init-time attributes
+ *
+ * Queries the QP state and configuration via a control virtqueue command.
+ * This is a synchronous operation.
+ *
+ * Context: Process context (can sleep)
+ * Return:
+ * * 0 on success
+ * * -ENOMEM if allocation fails
+ * * -EIO or other negative errno on communication failure
+ */
+static int vrdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+                         int attr_mask, struct ib_qp_init_attr *init_attr)
+{
+       struct vrdma_qp *vqp = to_vqp(ibqp);
+       struct vrdma_dev *vdev = to_vdev(ibqp->device);
+       struct vrdma_cmd_query_qp *cmd;
+       struct vrdma_rsp_query_qp *rsp;
+       struct scatterlist in, out;
+       int rc;
+
+       /* Allocate command and response buffers */
+       cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+       if (!cmd)
+               return -ENOMEM;
+
+       rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
+       if (!rsp) {
+               kfree(cmd);
+               return -ENOMEM;
+       }
+
+       /* Fill input parameters */
+       cmd->qpn = cpu_to_le32(vqp->qp_handle);
+       cmd->attr_mask = cpu_to_le32(attr_mask); /* hint; some backends ignore it */
+
+       sg_init_one(&in, cmd, sizeof(*cmd));
+       sg_init_one(&out, rsp, sizeof(*rsp));
+
+       /* Execute command over control virtqueue */
+       rc = vrdma_exec_verbs_cmd(vdev, VIRTIO_RDMA_CMD_QUERY_QP, &in, &out);
+       if (rc) {
+               dev_err(&vdev->vdev->dev,
+                       "VIRTIO_RDMA_CMD_QUERY_QP failed: qpn=0x%x err=%d\n",
+                       vqp->qp_handle, rc);
+               goto out_free;
+       }
+
+       /* Only copy results on success */
+       attr->qp_state = rsp->attr.qp_state;
+       attr->cur_qp_state = rsp->attr.cur_qp_state;
+       attr->path_mtu = rsp->attr.path_mtu;
+       attr->path_mig_state = rsp->attr.path_mig_state;
+       attr->qkey = le32_to_cpu(rsp->attr.qkey);
+       attr->rq_psn = le32_to_cpu(rsp->attr.rq_psn);
+       attr->sq_psn = le32_to_cpu(rsp->attr.sq_psn);
+       attr->dest_qp_num = le32_to_cpu(rsp->attr.dest_qp_num);
+       attr->qp_access_flags = le32_to_cpu(rsp->attr.qp_access_flags);
+       attr->pkey_index = le16_to_cpu(rsp->attr.pkey_index);
+       attr->alt_pkey_index = le16_to_cpu(rsp->attr.alt_pkey_index);
+       attr->en_sqd_async_notify = rsp->attr.en_sqd_async_notify;
+       attr->sq_draining = rsp->attr.sq_draining;
+       attr->max_rd_atomic = rsp->attr.max_rd_atomic;
+       attr->max_dest_rd_atomic = rsp->attr.max_dest_rd_atomic;
+       attr->min_rnr_timer = rsp->attr.min_rnr_timer;
+       attr->port_num = rsp->attr.port_num;
+       attr->timeout = rsp->attr.timeout;
+       attr->retry_cnt = rsp->attr.retry_cnt;
+       attr->rnr_retry = rsp->attr.rnr_retry;
+       attr->alt_port_num = rsp->attr.alt_port_num;
+       attr->alt_timeout = rsp->attr.alt_timeout;
+       attr->rate_limit = le32_to_cpu(rsp->attr.rate_limit);
+
+       /* Copy capabilities */
+       vrdma_to_ib_qp_cap(&attr->cap, &rsp->attr.cap);
+
+       /* Convert AH attributes (contains GRH + DIP) */
+       vrdma_to_rdma_ah_attr(&attr->ah_attr, &rsp->attr.ah_attr);
+       vrdma_to_rdma_ah_attr(&attr->alt_ah_attr, &rsp->attr.alt_ah_attr);
+
+       /* Fill init attributes (mostly static) */
+       init_attr->event_handler = vqp->ibqp.event_handler;
+       init_attr->qp_context = vqp->ibqp.qp_context;
+       init_attr->send_cq = vqp->ibqp.send_cq;
+       init_attr->recv_cq = vqp->ibqp.recv_cq;
+       init_attr->srq = vqp->ibqp.srq;
+       init_attr->xrcd = NULL; /* Not supported in vRDMA */
+       init_attr->cap = attr->cap;
+       init_attr->sq_sig_type = IB_SIGNAL_REQ_WR; /* Or driver default */
+       init_attr->qp_type = vqp->ibqp.qp_type;
+       init_attr->create_flags = 0;
+       init_attr->port_num = vqp->port;
+
+out_free:
+       kfree(rsp);
+       kfree(cmd);
+       return rc;
+}
+
+/**
+ * vrdma_reg_user_mr - Register a user memory region
+ * @pd:                Protection domain
+ * @start:     User virtual address of memory to register
+ * @length:    Length of memory region
+ * @virt_addr: Optional virtual address for rkey access (often same as start)
+ * @access_flags: Access permissions (IB_ACCESS_xxx)
+ * @udata:     User data (optional, unused here)
+ *
+ * Pins the user pages, builds a DMA page table, and registers the MR with
+ * the backend.
+ *
+ * Context: Process context (may sleep during ib_umem_get)
+ * Return:
+ * * Pointer to &mr->ibmr on success
+ * * ERR_PTR(-errno) on failure
+ */
+static struct ib_mr *vrdma_reg_user_mr(struct ib_pd *pd, u64 start,
+                                            u64 length, u64 virt_addr,
+                                            int access_flags,
+                                            struct ib_udata *udata)
+{
+       struct vrdma_dev *dev = to_vdev(pd->device);
+       struct vrdma_cmd_reg_user_mr *cmd;
+       struct vrdma_rsp_reg_user_mr *rsp;
+       struct vrdma_mr *mr;
+       struct ib_umem *umem;
+       struct sg_dma_page_iter sg_iter;
+       struct scatterlist in, out;
+       int rc = 0;
+       unsigned int npages;
+       dma_addr_t *pages_flat = NULL;
+
+       /* Pin the user memory pages */
+       umem = ib_umem_get(pd->device, start, length, access_flags);
+       if (IS_ERR(umem)) {
+               dev_err(&dev->vdev->dev,
+                       "Failed to pin user memory: va=0x%llx len=%llu\n",
+                       start, length);
+               return ERR_CAST(umem);
+       }
+
+       npages = ib_umem_num_pages(umem);
+       if (npages == 0 || npages > VRDMA_MAX_PAGES) {
+               dev_err(&dev->vdev->dev, "Invalid number of pages: %u\n", npages);
+               rc = -EINVAL;
+               goto err_umem;
+       }
+
+       /* Allocate command/response structures (GFP_KERNEL ok in process context) */
+       cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+       rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
+       mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+       if (!cmd || !rsp || !mr) {
+               rc = -ENOMEM;
+               goto err_alloc;
+       }
+
+       /* Initialize MR structure */
+       mr->umem = umem;
+       mr->size = length;
+       mr->iova = virt_addr;
+       mr->max_pages = npages;
+
+       /* Allocate contiguous DMA-mapped array for page addresses */
+       pages_flat = dma_alloc_coherent(&dev->vdev->dev,
+                                       npages * sizeof(dma_addr_t),
+                                       &mr->dma_pages, GFP_KERNEL);
+       if (!pages_flat) {
+               dev_err(&dev->vdev->dev,
+                       "Failed to allocate DMA memory for page table\n");
+               rc = -ENOMEM;
+               goto err_alloc;
+       }
+       mr->pages_k = pages_flat; /* flat array; device views it as [i / 512][i % 512] */
+
+       /* Fill page table from ib_umem scatterlist */
+       mr->npages = 0;
+       for_each_sg_dma_page(umem->sgt_append.sgt.sgl, &sg_iter,
+                            umem->sgt_append.sgt.nents, 0) {
+               dma_addr_t addr = sg_page_iter_dma_address(&sg_iter);
+               pages_flat[mr->npages++] = addr;
+       }
+
+       /* Sanity check: should match ib_umem_num_pages() */
+       WARN_ON(mr->npages != npages);
+
+       /* Prepare command */
+       cmd->pdn = cpu_to_le32(to_vpd(pd)->pd_handle);
+       cmd->start = cpu_to_le64(start);
+       cmd->length = cpu_to_le64(length);
+       cmd->virt_addr = cpu_to_le64(virt_addr);
+       cmd->access_flags = cpu_to_le32(access_flags);
+       cmd->pages = cpu_to_le64(mr->dma_pages); /* DMA address of page array */
+       cmd->npages = cpu_to_le32(npages);
+
+       sg_init_one(&in, cmd, sizeof(*cmd));
+       sg_init_one(&out, rsp, sizeof(*rsp));
+
+       /* Send command to backend */
+       rc = vrdma_exec_verbs_cmd(dev, VIRTIO_RDMA_CMD_REG_USER_MR, &in, &out);
+       if (rc) {
+               dev_err(&dev->vdev->dev, "Backend failed to register MR: %d\n", rc);
+               goto err_cmd;
+       }
+
+       /* Copy results from response */
+       mr->mr_handle = le32_to_cpu(rsp->mrn);
+       mr->ibmr.lkey = le32_to_cpu(rsp->lkey);
+       mr->ibmr.rkey = le32_to_cpu(rsp->rkey);
+
+       /* Cleanup temporary allocations */
+       kfree(cmd);
+       kfree(rsp);
+
+       /* Link the MR to its PD and fill in the remaining ib_mr fields */
+       mr->ibmr.pd = pd;
+       mr->ibmr.device = pd->device;
+       mr->ibmr.type = IB_MR_TYPE_MEM_REG;
+       mr->ibmr.length = length;
+
+       return &mr->ibmr;
+
+err_cmd:
+       dma_free_coherent(&dev->vdev->dev, npages * sizeof(dma_addr_t),
+                         pages_flat, mr->dma_pages);
+err_alloc:
+       kfree(mr);
+       kfree(rsp);
+       kfree(cmd);
+err_umem:
+       ib_umem_release(umem);
+       return ERR_PTR(rc);
+}
+
 static const struct ib_device_ops vrdma_dev_ops = {
        .owner = THIS_MODULE,
        .uverbs_abi_ver = VIRTIO_RDMA_ABI_VERSION,
@@ -2554,7 +2882,10 @@ static const struct ib_device_ops vrdma_dev_ops = {
        .modify_qp = vrdma_modify_qp,
        .poll_cq = vrdma_poll_cq,
        .post_recv = vrdma_post_recv,
-       .post_send = vrdma_post_send,                   
+       .post_send = vrdma_post_send,
+       .query_pkey = vrdma_query_pkey,
+       .query_qp = vrdma_query_qp,
+       .reg_user_mr = vrdma_reg_user_mr,
 };
 
 /**
-- 
2.43.0
